diff options
author | Jaegeuk Kim <jaegeuk@kernel.org> | 2017-01-09 18:16:29 -0800 |
---|---|---|
committer | Jaegeuk Kim <jaegeuk@google.com> | 2017-10-03 17:19:14 -0700 |
commit | 13f002354db13a029aef84aa1a8bfa51bd6b8d56 (patch) | |
tree | 1edef6487d2b259ce5fb01aa2c0458c633bf2007 /fs | |
parent | b0fa18e1caa5390619cfb6878b1b63879908ed10 (diff) |
f2fs: catch up to v4.14-rc1
This is cherry-picked from upstrea-f2fs-stable-linux-4.4.y.
Changes include:
commit c7fd9e2b4a6876 ("f2fs: hurry up to issue discard after io interruption")
commit 603dde39653d6d ("f2fs: fix to show correct discard_granularity in sysfs")
...
commit 565f0225f95f15 ("f2fs: factor out discard command info into discard_cmd_control")
commit c4cc29d19eaf01 ("f2fs: remove batched discard in f2fs_trim_fs")
Change-Id: Icd8a85ac0c19a8aa25cd2591a12b4e9b85bdf1c5
Signed-off-by: Jaegeuk Kim <jaegeuk@google.com>
Diffstat (limited to 'fs')
35 files changed, 6796 insertions, 3395 deletions
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/namei.h> +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,10 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <linux/fscrypto.h> +#include <crypto/aes.h> +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +44,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,20 +141,15 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -162,6 +157,18 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -174,15 +181,11 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +205,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +301,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +315,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -351,7 +334,6 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -363,18 +345,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* @@ -399,63 +374,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +399,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +443,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. @@ -562,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -209,7 +211,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,14 +229,17 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -332,14 +345,39 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -350,7 +388,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,31 +405,43 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; @@ -403,12 +453,3 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..79d79755d79b --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,107 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include <linux/fscrypt_supp.h> +#include <crypto/hash.h> + +/* Encryption parameters */ +#define FS_IV_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,12 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> -#include <linux/fscrypto.h> +#include <linux/ratelimit.h> +#include <crypto/aes.h> +#include <crypto/sha.h> +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. */ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,28 +82,25 @@ out: static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix, int min_keysize) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -106,66 +108,68 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { - if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; + if (S_ISREG(inode->i_mode)) { + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -173,12 +177,78 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. + */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,30 +258,24 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + if (inode->i_crypt_info) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -230,7 +294,7 @@ retry: crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -247,20 +311,12 @@ retry: if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,30 +325,35 @@ retry: } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } } - return 0; - + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; @@ -300,6 +361,7 @@ out: kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -317,17 +379,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..9914d51dff86 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,76 +10,37 @@ #include <linux/random.h> #include <linux/string.h> -#include <linux/fscrypto.h> #include <linux/mount.h> - -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} +#include "fscrypt_private.h" /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; + + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp, inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + &policy); + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -134,49 +100,94 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. + */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -185,17 +196,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) + return 0; + + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -204,9 +230,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. - * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -215,9 +241,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -227,19 +250,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; @@ -236,7 +237,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? (REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -208,12 +209,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -232,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -246,16 +246,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -264,23 +265,33 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -292,7 +303,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -343,7 +354,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -357,7 +368,7 @@ continue_unlock: } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -494,6 +505,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -566,7 +578,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -577,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -597,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -676,14 +708,13 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; @@ -816,7 +847,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -874,6 +907,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -896,14 +930,30 @@ retry: inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -941,6 +991,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,33 +1027,47 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. - */ retry_flush_nodes: down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -999,8 +1076,6 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -1023,15 +1098,24 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); + + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); - spin_lock(&sbi->cp_lock); + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1047,15 +1131,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1067,19 +1150,16 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1098,18 +1178,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1138,6 +1214,27 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1187,7 +1284,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1226,8 +1323,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1246,10 +1343,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1274,7 +1371,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ @@ -1287,7 +1384,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c deleted file mode 100644 index 18595d7a0efc..000000000000 --- a/fs/f2fs/crypto_key.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * linux/fs/f2fs/crypto_key.c - * - * Copied from linux/fs/f2fs/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for f2fs - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ -#include <keys/encrypted-type.h> -#include <keys/user-type.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <uapi/linux/keyctl.h> -#include <crypto/hash.h> -#include <linux/f2fs_fs.h> - -#include "f2fs.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct f2fs_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * f2fs_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. - */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - if (fi->i_crypt_info) - return 0; - - res = f2fs_crypto_initialize(); - if (res) - return res; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - key_put(keyring_key); - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 884f3f0fe29d..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#include <linux/random.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/f2fs_fs.h> - -#include "f2fs.h" -#include "xattr.h" - -static int f2fs_inode_has_encryption_context(struct inode *inode) -{ - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int f2fs_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL); - - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int f2fs_create_encryption_context_from_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - - if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - - if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - - if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) - return -EINVAL; - - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - - return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL, XATTR_CREATE); -} - -int f2fs_process_policy(const struct f2fs_encryption_policy *policy, - struct inode *inode) -{ - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (policy->version != 0) - return -EINVAL; - - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - - if (!f2fs_inode_has_encryption_context(inode)) { - if (!f2fs_empty_dir(inode)) - return -ENOTEMPTY; - return f2fs_create_encryption_context_from_policy(inode, - policy); - } - - if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res; - - if (!f2fs_encrypted_inode(inode)) - return -ENODATA; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res != sizeof(ctx)) - return -ENODATA; - if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - const struct f2fs_crypt_info *parent_ci, *child_ci; - struct f2fs_encryption_context parent_ctx, child_ctx; - int res; - - /* No restrictions on file types which are never encrypted */ - if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && - !S_ISLNK(child->i_mode)) - return 1; - - /* No restrictions if the parent directory is unencrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - - /* Encrypted directories must not contain unencrypted files */ - if (!f2fs_encrypted_inode(child)) - return 0; - - /* - * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are - * available, otherwise retrieve and compare the fscrypt_contexts. - * - * Note that the fscrypt_context retrieval will be required frequently - * when accessing an encrypted directory tree without the key. - * Performance-wise this is not a big deal because we already don't - * really optimize for file access without the key (to the extent that - * such access is even possible), given that any attempted access - * already causes a fscrypt_context retrieval and keyring search. - * - * In any case, if an unexpected error occurs, fall back to "forbidden". - */ - - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - - res = f2fs_getxattr(parent, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx), NULL); - if (res != sizeof(parent_ctx)) - return 0; - - res = f2fs_getxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx), NULL); - if (res != sizeof(child_ctx)) - return 0; - - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4ac72a3f920a..c8583d7a1845 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,8 +56,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -223,7 +225,7 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -244,8 +246,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -254,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -264,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -276,70 +279,88 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. */ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) +{ + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } +} + +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -360,19 +381,35 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; - io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -380,10 +417,10 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + /* set submitted = 1 as a return value */ + fio->submitted = 1; - down_write(&io->io_rwsem); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || @@ -398,32 +435,86 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -451,14 +542,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -466,8 +558,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -509,7 +601,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -526,18 +618,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -578,9 +660,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -709,23 +789,25 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -739,7 +821,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -750,6 +832,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -768,7 +853,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -778,6 +863,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -798,7 +898,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -820,7 +920,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -842,7 +942,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -862,7 +962,7 @@ next_block: } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -930,7 +1030,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -939,7 +1039,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -962,7 +1062,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } @@ -979,7 +1079,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1085,35 +1185,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1142,9 +1213,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1177,7 +1249,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1208,12 +1280,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1273,17 +1344,84 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1292,57 +1430,57 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } +got_it: + /* + * If current allocation needs SSR, + * it had better in-place writes for updated data. + */ + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; + set_page_writeback(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); + set_inode_flag(inode, FI_UPDATE_WRITE); + return err; + } - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; goto out_writepage; } + fio->need_lock = LOCK_REQ; } + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + set_page_writeback(page); - /* - * If current allocation needs SSR, - * it had better in-place writes for updated data. - */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); - set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); - } + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1358,12 +1496,19 @@ static int f2fs_write_data_page(struct page *page, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1377,8 +1522,6 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1395,6 +1538,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1403,16 +1547,26 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); - if (err == -EAGAIN) + if (!err) + goto out; + } + + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1423,15 +1577,23 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, DATA, WRITE); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1443,13 +1605,20 @@ redirty_out: return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1459,13 +1628,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1499,6 +1674,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1506,7 +1682,7 @@ retry: } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1532,7 +1708,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1542,16 +1718,27 @@ continue_unlock: unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1569,15 +1756,16 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1592,6 +1780,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1601,15 +1793,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1624,14 +1821,26 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1644,19 +1853,20 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1670,7 +1880,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1692,7 +1902,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1706,7 +1917,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1745,7 +1956,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; @@ -1772,8 +1987,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1787,21 +2002,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1914,10 +2117,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } if (trace_android_fs_dataread_start_enabled() && @@ -1955,7 +2161,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2064,8 +2270,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cd338ca24941..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,9 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -64,6 +81,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -78,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -91,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -116,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -148,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -185,6 +209,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; @@ -196,6 +224,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -256,8 +289,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -316,10 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - si->nr_wb_cp_data, si->nr_wb_data); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -332,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -419,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -250,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -268,7 +256,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -330,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -377,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -431,15 +404,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. @@ -535,7 +512,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -545,8 +522,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -590,11 +569,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -643,14 +620,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } @@ -708,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -721,7 +720,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -738,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); @@ -882,7 +886,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include <trace/events/f2fs.h> +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -77,7 +250,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +267,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -177,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -215,6 +358,16 @@ out: return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -237,17 +390,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +416,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -413,7 +486,7 @@ do_insert: return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +499,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +507,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -447,8 +520,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; @@ -531,8 +607,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -548,7 +622,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +644,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +678,7 @@ free_node: spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +725,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +736,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,12 @@ #include <linux/vmalloc.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/fscrypto.h> +#include <linux/quotaops.h> +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include <linux/fscrypt_supp.h> +#else +#include <linux/fscrypt_notsupp.h> +#endif #include <crypto/hash.h> #include <linux/writeback.h> @@ -47,6 +52,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -59,7 +65,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -84,10 +90,14 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -103,15 +113,19 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -187,19 +201,22 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -241,19 +258,73 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ }; struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -264,13 +335,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -278,6 +349,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -285,6 +357,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -310,11 +383,17 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -339,6 +418,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -351,36 +436,68 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -412,16 +529,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -455,12 +586,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -502,16 +634,29 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -538,11 +683,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) { - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -564,7 +720,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -596,6 +752,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -603,9 +760,17 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -676,7 +841,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -699,12 +865,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -713,9 +873,13 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -760,29 +924,68 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -830,10 +1033,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -841,11 +1040,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -859,9 +1053,9 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ @@ -873,6 +1067,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -888,7 +1083,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -918,6 +1113,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -925,6 +1122,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -935,6 +1135,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -955,13 +1158,19 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -980,13 +1189,26 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1000,10 +1222,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1014,8 +1232,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1068,6 +1286,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1168,6 +1407,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1191,9 +1436,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1207,9 +1454,34 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1217,6 +1489,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1245,7 +1522,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1266,17 +1543,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1284,15 +1558,24 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) - return false; + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto enospc; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1302,32 +1585,42 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1456,51 +1749,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1528,11 +1840,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1611,22 +1926,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1689,6 +2024,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1715,6 +2064,10 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1768,13 +2121,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? inode->i_blocks + diff : - inode->i_blocks - diff; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -1826,6 +2187,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1842,6 +2205,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1852,13 +2222,14 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1876,12 +2247,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -1917,10 +2282,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2003,13 +2370,15 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2019,7 +2388,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2029,42 +2398,79 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2074,40 +2480,45 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2118,18 +2529,21 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2137,164 +2551,190 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool need_SSR(struct f2fs_sb_info *sbi); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2311,11 +2751,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int inline_xattr, inline_inode, inline_dir, orphans; - int aw_cnt, max_aw_cnt; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2388,9 +2832,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2398,11 +2842,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2412,14 +2867,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2427,40 +2882,43 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2483,53 +2941,79 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); /* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + +/* * crypto support */ static inline bool f2fs_encrypted_inode(struct inode *inode) @@ -2537,6 +3021,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2559,6 +3048,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) @@ -2606,28 +3110,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index f113f1a1e8c1..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include <linux/fs.h> - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include <linux/mount.h> #include <linux/pagevec.h> #include <linux/random.h> +#include <linux/uio.h> #include <linux/uuid.h> #include <linux/file.h> @@ -33,6 +34,19 @@ #include "trace.h" #include <trace/events/f2fs.h> +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -60,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -86,15 +101,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -102,7 +121,7 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -117,11 +136,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -142,8 +156,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -169,7 +181,6 @@ static void try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); @@ -268,9 +279,19 @@ sync_nodes: goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -278,7 +299,8 @@ sync_nodes: flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -375,7 +397,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -423,14 +446,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -443,11 +458,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -460,7 +474,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -469,9 +483,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -531,12 +549,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -569,10 +589,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -621,6 +638,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -642,7 +665,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -686,14 +708,34 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -701,7 +743,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -847,12 +891,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -883,7 +929,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -959,15 +1006,15 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1019,11 +1066,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1091,16 +1138,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1113,6 +1161,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1126,7 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1137,8 +1188,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1177,9 +1228,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1193,17 +1245,15 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1252,6 +1302,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1264,8 +1316,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1280,14 +1333,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1316,6 +1370,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1435,6 +1491,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1442,17 +1499,20 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) @@ -1481,28 +1541,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + ret = -EPERM; + goto unlock_out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); -out: + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1522,6 +1588,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1536,20 +1605,27 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + clear_inode_flag(inode, FI_HOT_DATA); + goto out; + } + +inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1580,10 +1656,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1599,6 +1676,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1612,6 +1692,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1667,6 +1750,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1712,7 +1796,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1773,31 +1857,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1863,7 +1932,51 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; out: mnt_drop_write_file(filp); return ret; @@ -1897,17 +2010,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1941,7 +2053,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -1965,7 +2077,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -1983,7 +2095,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2042,42 +2154,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; - err = mnt_want_write_file(filp); - if (err) - return err; - - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } + if (f2fs_readonly(sbi->sb)) + return -EROFS; if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } + sizeof(range))) + return -EFAULT; /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2211,6 +2321,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2220,6 +2332,79 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2251,12 +2436,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2269,16 +2460,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2476,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2322,10 +2516,12 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; @@ -2341,6 +2537,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,17 +28,23 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -48,10 +54,15 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -66,23 +77,28 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -90,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -107,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -166,10 +187,15 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -179,7 +205,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -199,8 +225,12 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -208,8 +238,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -218,7 +248,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -237,6 +267,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, true); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -245,7 +285,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -274,6 +314,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -287,10 +328,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -303,9 +352,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -322,13 +372,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -338,17 +390,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -531,12 +584,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -544,15 +599,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -596,7 +657,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -634,7 +695,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -676,10 +739,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -768,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -803,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -847,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -871,7 +941,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -886,7 +956,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -894,90 +963,113 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret = -EINVAL; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { + if (prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; } - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ - goto stop; + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - ret = 0; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; + goto stop; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -989,5 +1081,20 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fb5d7d1f34aa..fbf22b0f667f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -23,10 +23,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -45,6 +45,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -52,31 +53,33 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) - return false; + if (from >= MAX_INLINE_DATA(inode)) + return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -132,6 +135,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -153,6 +157,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -164,11 +169,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -215,6 +220,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -231,11 +238,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -270,9 +282,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,9 +297,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -301,11 +312,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -314,11 +325,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -332,19 +343,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; struct f2fs_dentry_ptr d; + void *inline_dentry; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -353,11 +364,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -372,25 +384,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. */ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -398,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -411,14 +422,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -460,20 +470,20 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -489,9 +499,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -500,7 +510,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -516,7 +526,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -526,11 +536,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -545,14 +556,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -575,7 +583,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -583,11 +592,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -604,20 +614,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -627,25 +638,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? err : 0; @@ -670,7 +683,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -679,7 +692,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include <trace/events/f2fs.h> @@ -44,25 +45,26 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -130,7 +206,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -226,6 +315,7 @@ make_now: ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -267,7 +357,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); @@ -291,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -316,7 +420,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +442,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,10 +476,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) - goto no_delete; -#endif + dquot_initialize(inode); remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -387,6 +488,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -403,13 +510,22 @@ retry: if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -421,9 +537,10 @@ no_delete: if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -446,6 +563,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include <linux/ctype.h> #include <linux/dcache.h> #include <linux/namei.h> +#include <linux/quotaops.h> #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,16 +55,35 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -75,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -85,6 +116,16 @@ fail: set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +177,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -148,8 +193,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +206,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -180,6 +225,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -233,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -324,9 +382,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -346,6 +405,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -400,7 +463,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -409,6 +472,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -420,8 +487,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -444,7 +509,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -484,6 +549,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -496,6 +563,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -505,8 +576,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -521,6 +590,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -544,6 +615,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -551,8 +626,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +639,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -579,6 +654,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -592,8 +671,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +696,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -672,6 +751,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -717,13 +809,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -775,9 +860,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -858,6 +944,22 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -905,8 +1007,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -914,18 +1016,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -941,7 +1031,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -956,7 +1046,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); @@ -969,14 +1059,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +64,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -157,9 +159,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -170,25 +169,27 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +382,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +408,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -549,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -573,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -608,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -649,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -673,15 +682,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -689,7 +694,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -875,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -971,9 +978,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -1009,7 +1013,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); @@ -1024,14 +1028,13 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1042,17 +1045,18 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1153,6 +1157,7 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1166,15 +1171,27 @@ repeat: goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); - ClearPageUptodate(page); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1318,16 +1335,103 @@ continue_unlock: return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= WRITE_FLUSH_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1349,6 +1453,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1380,6 +1485,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1397,13 +1505,16 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1429,12 +1540,13 @@ continue_unlock: goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? -EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1458,6 +1570,7 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1511,9 +1624,11 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1534,7 +1649,7 @@ continue_unlock: } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1580,72 +1695,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1653,6 +1702,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1665,7 +1717,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1743,43 +1795,71 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; - - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; - } + return false; i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return 0; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return 0; - } - return 1; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1800,17 +1880,45 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1818,11 +1926,58 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1830,6 +1985,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -1837,6 +1995,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1879,10 +2045,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1897,8 +2063,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); @@ -1918,13 +2086,16 @@ retry: i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + spin_unlock(&nm_i->nid_list_lock); return true; } spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -1972,6 +2143,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2034,38 +2207,47 @@ update_inode: f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ - if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(sbi, 1); + /* 2: update xattr nid in inode */ + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); + update_inode_page(inode); - remove_free_nid(sbi, new_xnid); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2101,12 +2283,19 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2208,8 +2397,39 @@ add_out: list_add_tail(&nes->set_list, head); } +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2224,7 +2444,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2241,8 +2462,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2255,30 +2475,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2299,7 +2527,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2313,11 +2542,86 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} + +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -2325,15 +2629,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2367,6 +2671,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2377,6 +2685,27 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2389,7 +2718,14 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + build_free_nids(sbi, true, true); return 0; } @@ -2447,7 +2783,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,21 +200,18 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -227,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } @@ -306,14 +299,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -321,14 +311,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } @@ -358,7 +345,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4a3c48c24c10..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -198,7 +220,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,17 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -326,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -359,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -378,11 +415,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -428,8 +463,9 @@ retry_dn: } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -556,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -570,13 +621,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -585,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -598,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -613,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,10 +16,13 @@ #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> +#include <linux/freezer.h> +#include <linux/sched.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -166,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -212,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -247,9 +271,40 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -260,10 +315,10 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -279,25 +334,31 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -352,15 +413,14 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -369,7 +429,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -386,9 +446,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -409,7 +469,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -418,20 +479,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -445,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -453,6 +520,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -461,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -470,36 +541,60 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - int ret; + if (!test_opt(sbi, FLUSH_MERGE)) { + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + return ret; + } - atomic_inc(&fcc->submit_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -513,16 +608,22 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -592,8 +693,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -613,7 +714,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -628,61 +729,91 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; - - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr && - blkaddr < dc->lstart + dc->len) || - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; - } + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); + return dc; +} - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - } +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); + + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); + dc->error = bio->bi_error; + dc->state = D_DONE; + complete_all(&dc->wait); + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -739,12 +870,12 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -766,58 +897,471 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ block_t lblkstart = blkstart; - int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { + __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); + if (++iter >= DISCARD_ISSUE_RATE) + goto out; + } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } - return err; +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super and f2fs_trim_fs */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); + __wait_discard_cmd(sbi, false); +} + +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; + + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -829,10 +1373,21 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: @@ -850,7 +1405,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -893,31 +1448,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -929,7 +1459,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -937,7 +1469,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -946,7 +1479,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -959,14 +1493,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,16 +1536,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); - - blk_start_plug(&plug); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1031,32 +1573,117 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - blk_finish_plug(&plug); + wake_up_discard_thread(sbi, false); +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) { + INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } + INIT_LIST_HEAD(&dcc->wait_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1085,6 +1712,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1101,32 +1732,54 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; + } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1312,8 +1965,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1323,8 +1976,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1355,8 +2008,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1400,7 +2053,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1413,6 +2066,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1431,6 +2098,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1473,7 +2141,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1494,28 +2162,53 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; + + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } + } return 0; } @@ -1530,12 +2223,13 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -1563,14 +2257,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -1623,6 +2322,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -1636,68 +2338,80 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1723,42 +2437,52 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -1769,13 +2493,16 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -1784,6 +2511,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -1797,13 +2526,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -1845,7 +2583,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -1864,7 +2602,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -1894,7 +2632,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -1902,8 +2641,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; @@ -2052,6 +2790,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2078,6 +2818,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } @@ -2367,7 +3112,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2403,7 +3148,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2431,13 +3176,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2470,7 +3215,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2521,12 +3266,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2602,10 +3347,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2629,10 +3381,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2676,7 +3433,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2694,7 +3451,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2716,7 +3473,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2782,22 +3539,22 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2919,6 +3676,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,88 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +113,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +142,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +240,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +318,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +373,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +394,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +406,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +428,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -475,27 +492,9 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; -} - -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +539,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,20 +547,22 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -572,6 +574,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -716,6 +727,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) @@ -727,8 +747,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { @@ -764,3 +784,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,8 +22,10 @@ #include <linux/random.h> #include <linux/exportfs.h> #include <linux/blkdev.h> +#include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> +#include <linux/quota.h> #include "f2fs.h" #include "node.h" @@ -35,9 +37,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -89,6 +91,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -105,6 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -115,6 +132,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -122,6 +140,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -138,225 +157,139 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); + struct va_format vaf; + va_list args; - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); } -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static void init_once(void *foo) { - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); + struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); + inode_init_once(&fi->vfs_inode); } -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) { - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; } -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) { - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + struct f2fs_sb_info *sbi = F2FS_SB(sb); - return a->store ? a->store(a, sbi, buf, len) : 0; + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; } -static void f2fs_sb_release(struct kobject *kobj) +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) { - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; } - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), #endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - -void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); - va_end(args); -} - -static void init_once(void *foo) -{ - struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; - - inode_init_once(&fi->vfs_inode); -} static int parse_options(struct super_block *sb, char *options) { @@ -365,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -431,6 +367,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -441,6 +380,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -454,6 +396,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -553,6 +499,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -564,6 +511,81 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -571,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -603,14 +629,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -643,10 +677,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) @@ -744,15 +780,9 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -771,7 +801,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -786,7 +823,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -796,8 +833,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -805,8 +842,15 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -817,6 +861,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -851,12 +898,55 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -867,19 +957,67 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -897,7 +1035,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -905,6 +1045,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -943,87 +1085,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1049,6 +1141,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -1056,14 +1149,37 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1073,7 +1189,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1088,6 +1203,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1136,6 +1261,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -1150,21 +1280,289 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } -static struct super_operations f2fs_sops = { +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return 0; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, type); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1182,12 +1580,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1202,16 +1594,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1263,9 +1655,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1467,6 +1866,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1480,6 +1886,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1501,6 +1909,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1511,7 +1933,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1539,17 +1961,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1579,16 +1998,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1724,36 +2143,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, + sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1773,6 +2215,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, @@ -1841,6 +2285,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1850,6 +2299,7 @@ try_onemore: if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -1871,6 +2321,12 @@ try_onemore: sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1886,18 +2342,34 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1910,9 +2382,11 @@ try_onemore: if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); - if (!sbi->write_io_dummy) + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ @@ -1944,6 +2418,7 @@ try_onemore: sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -1991,10 +2466,9 @@ try_onemore: f2fs_join_shrinker(sbi); - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_build_stats(sbi); if (err) - goto free_node_inode; + goto free_nm; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -2015,26 +2489,14 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); if (err) - goto free_proc; + goto free_sysfs; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2045,7 +2507,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_meta; } if (need_fsck) @@ -2059,7 +2521,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2068,7 +2530,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2083,7 +2545,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_meta; } kfree(options); @@ -2095,22 +2557,23 @@ skip_recovery: sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_meta: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - f2fs_destroy_stats(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2119,15 +2582,9 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). - */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: @@ -2141,7 +2598,13 @@ free_meta_inode: free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2167,8 +2630,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -2222,30 +2688,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_init_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2262,11 +2724,10 @@ fail: static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..e2c258f717cd --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,556 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu <chao@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/proc_fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,18 +264,123 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -299,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -333,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -386,7 +489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -395,23 +498,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,21 +520,18 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -442,8 +539,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -456,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -485,6 +583,15 @@ cleanup: return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -519,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; @@ -618,7 +730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ |