From 0208fabf7256245125fbabf03207a0da4000ea2d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Nov 2017 11:13:07 +0100 Subject: mm, hwpoison: fixup "mm: check the return value of lookup_page_ext for all call sites" Backport of the upstream commit f86e4271978b ("mm: check the return value of lookup_page_ext for all call sites") is wrong for hwpoison pages. I have accidentally negated the condition for bailout. This basically disables hwpoison pages tracking while the code still might crash on unusual configurations when struct pages do not have page_ext allocated. The fix is trivial to invert the condition. Reported-by: Jiri Slaby Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- mm/debug-pagealloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index fe1c61f7cf26..3b8f1b83610e 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -34,7 +34,7 @@ static inline void set_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); - if (page_ext) + if (!page_ext) return; __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -44,7 +44,7 @@ static inline void clear_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); - if (page_ext) + if (!page_ext) return; __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -54,7 +54,7 @@ static inline bool page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); - if (page_ext) + if (!page_ext) return false; return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -- cgit v1.2.3 From 29ffb9c1fb4acbda207985ad1558191ffb776bee Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 24 Nov 2017 11:26:29 +0100 Subject: Linux 4.4.102 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0d7b050427ed..9e036fac9c04 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 101 +SUBLEVEL = 102 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3 From 663d2e5444a6791c41d690b02c2ae027b1cdb062 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Sep 2017 10:56:13 +0200 Subject: UPSTREAM: android: binder: fix type mismatch warning Allowing binder to expose the 64-bit API on 32-bit kernels caused a build warning: drivers/android/binder.c: In function 'binder_transaction_buffer_release': drivers/android/binder.c:2220:15: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_translate_fd_array': drivers/android/binder.c:2445:13: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] fd_array = (u32 *)(parent_buffer + fda->parent_offset); ^ drivers/android/binder.c: In function 'binder_fixup_parent': drivers/android/binder.c:2511:18: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast] This adds extra type casts to avoid the warning. However, there is another problem with the Kconfig option: turning it on or off creates two incompatible ABI versions, a kernel that has this enabled cannot run user space that was built without it or vice versa. A better solution might be to leave the option hidden until the binder code is fixed to deal with both ABI versions. Fixes: e8d2ed7db7c3 ("Revert "staging: Fix build issues with new binder API"") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 1c363eaece2752c5f8b1b874cb4ae435de06aa66) Change-Id: Id09185a6f86905926699e92a2b30201b8a5e83e5 --- drivers/android/binder.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 008e448536de..d559596988d7 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2468,7 +2468,7 @@ static void binder_transaction_buffer_release(struct binder_proc *proc, debug_id, (u64)fda->num_fds); continue; } - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); for (fd_index = 0; fd_index < fda->num_fds; fd_index++) task_close_fd(proc, fd_array[fd_index]); } break; @@ -2692,7 +2692,7 @@ static int binder_translate_fd_array(struct binder_fd_array_object *fda, */ parent_buffer = parent->buffer - binder_alloc_get_user_buffer_offset(&target_proc->alloc); - fd_array = (u32 *)(parent_buffer + fda->parent_offset); + fd_array = (u32 *)(parent_buffer + (uintptr_t)fda->parent_offset); if (!IS_ALIGNED((unsigned long)fd_array, sizeof(u32))) { binder_user_error("%d:%d parent offset not aligned correctly.\n", proc->pid, thread->pid); @@ -2758,7 +2758,7 @@ static int binder_fixup_parent(struct binder_transaction *t, proc->pid, thread->pid); return -EINVAL; } - parent_buffer = (u8 *)(parent->buffer - + parent_buffer = (u8 *)((uintptr_t)parent->buffer - binder_alloc_get_user_buffer_offset( &target_proc->alloc)); *(binder_uintptr_t *)(parent_buffer + bp->parent_offset) = bp->buffer; -- cgit v1.2.3 From 353c16247d72937c90e7ef04cbd603502581019a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 23 Sep 2017 17:02:18 +0800 Subject: f2fs: updates on 4.15-rc1 Pull f2fs updates from Jaegeuk Kim: "In this round, we introduce sysfile-based quota support which is required for Android by default. In addition, we allow that users are able to reserve some blocks in runtime to mitigate performance drops in low free space. Enhancements: - assign proper data segments according to write_hints given by user - issue cache_flush on dirty devices only among multiple devices - exploit cp_error flag and add more faults to enhance fault injection test - conduct more readaheads during f2fs_readdir - add a range for discard commands Bug fixes: - fix zero stat->st_blocks when inline_data is set - drop crypto key and free stale memory pointer while evict_inode is failing - fix some corner cases in free space and segment management - fix wrong last_disk_size This series includes lots of clean-ups and code enhancement in terms of xattr operations, discard/flush command control. In addition, it adds versatile debugfs entries to monitor f2fs status" Cherry-picked from origin/upstream-f2fs-stable-linux-4.4.y: 56a07b070510 f2fs: deny accessing encryption policy if encryption is off c394842e26e5 f2fs: inject fault in inc_valid_node_count 926292251022 f2fs: fix to clear FI_NO_PREALLOC e6cfc5de2d05 f2fs: expose quota information in debugfs c4cd2efe835b f2fs: separate nat entry mem alloc from nat_tree_lock 48c72b4c8c50 f2fs: validate before set/clear free nat bitmap baf9275a4bbd f2fs: avoid opened loop codes in __add_ino_entry 47af6c72d944 f2fs: apply write hints to select the type of segments for buffered write ac9819160586 f2fs: introduce scan_curseg_cache for cleanup ca28e9670e80 f2fs: optimize the way of traversing free_nid_bitmap 460688b59e8b f2fs: keep scanning until enough free nids are acquired 0186182c0c4d f2fs: trace checkpoint reason in fsync() 5d4b6efcfd09 f2fs: keep isize once block is reserved cross EOF 3c8f767e1374 f2fs: avoid race in between GC and block exchange 4423778adf0e f2fs: save a multiplication for last_nid calculation 3e3b40557525 f2fs: fix summary info corruption 44889e487981 f2fs: remove dead code in update_meta_page 55c7b9595bb9 f2fs: remove unneeded semicolon 8b92814117d5 f2fs: don't bother with inode->i_version 42c7c71824fc f2fs: check curseg space before foreground GC c5470498e59b f2fs: use rw_semaphore to protect SIT cache 82750d346ab7 f2fs: support quota sys files 26dfec49b25a f2fs: add quota_ino feature infra ddb8e2ae9811 f2fs: optimize __update_nat_bits f46ae958c701 f2fs: modify for accurate fggc node io stat c713fdb5a23c Revert "f2fs: handle dirty segments inside refresh_sit_entry" 873ec505cb07 f2fs: add a function to move nid ae66786296b4 f2fs: export SSR allocation threshold 90c28a18d2a4 f2fs: give correct trimmed blocks in fstrim 5612922fb0ac f2fs: support bio allocation error injection 583b7a274c27 f2fs: support get_page error injection 09a073cc8c56 f2fs: add missing sysfs description e945474a9c1b f2fs: support soft block reservation b7b2e629b6f6 f2fs: handle error case when adding xattr entry 7368e30495c5 f2fs: support flexible inline xattr size ada4061e191b f2fs: show current cp state 5b8ff1301a61 f2fs: add missing quota_initialize 46d4a691f035 f2fs: show # of dirty segments via sysfs fc13f9d7ce1e f2fs: stop all the operations by cp_error flag 91bea0c391b3 f2fs: remove several redundant assignments 807486c79534 f2fs: avoid using timespec 03b1cb0bb4a2 f2fs: fix to correct no_fggc_candidate 5c15033ceaea Revert "f2fs: return wrong error number on f2fs_quota_write" 5f5f59322240 f2fs: remove obsolete pointer for truncate_xattr_node 032a6906825a f2fs: retry ENOMEM for quota_read|write 171b638fc49b f2fs: limit # of inmemory pages 83ed7a615f0a f2fs: update ctx->pos correctly when hitting hole in directory 4d6e68be2534 f2fs: relocate readahead codes in readdir() c8be47b54018 f2fs: allow readdir() to be interrupted 2b903fe94cd0 f2fs: trace f2fs_readdir bb0db666d4bc f2fs: trace f2fs_lookup 40d6250f046a f2fs: skip searching non-exist range in truncate_hole 8e84f379df61 f2fs: expose some sectors to user in inline data or dentry case cb98f70dea02 f2fs: avoid stale fi->gdirty_list pointer 5562a3c53963 f2fs/crypto: drop crypto key at evict_inode only 85853e7e38d7 f2fs: fix to avoid race when accessing last_disk_size 0c47a892d555 f2fs: Fix bool initialization/comparison 68e801abc520 f2fs: give up CP_TRIMMED_FLAG if it drops discards df74eacb2075 f2fs: trace f2fs_remove_discard bd502c6e3e7a f2fs: reduce cmd_lock coverage in __issue_discard_cmd a34ab5ca4f94 f2fs: split discard policy 1e65afd14d32 f2fs: wrap discard policy 684447dad138 f2fs: support issuing/waiting discard in range 27eaad09380f f2fs: fix to flush multiple device in checkpoint 08bb9d68d51b f2fs: enhance multiple device flush 9c2526ac2ecb f2fs: fix to show ino management cache size correctly 814b463d262f f2fs: drop FI_UPDATE_WRITE tag after f2fs_issue_flush f555b0a117d3 f2fs: obsolete ALLOC_NID_LIST list 75d3164ae128 f2fs: convert inline data for direct I/O & FI_NO_PREALLOC 4de0ceb6b7ef f2fs: allow readpages with NULL file pointer 322a45d17212 f2fs: show flush list status in sysfs 6d625a93b4a8 f2fs: introduce read_xattr_block 8ea6e1c327c5 f2fs: introduce read_inline_xattr dbce11e9ee5b Revert "f2fs: reuse nids more aggressively" 131bc9f6b7f9 Revert "f2fs: node segment is prior to data segment selected victim" Change-Id: I93b9cd867b859a667a448b39299ff44a2b841b8c Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 43 ++- fs/f2fs/acl.c | 3 + fs/f2fs/checkpoint.c | 64 +++- fs/f2fs/data.c | 38 ++- fs/f2fs/debug.c | 31 +- fs/f2fs/dir.c | 32 +- fs/f2fs/f2fs.h | 223 +++++++++++--- fs/f2fs/file.c | 120 ++++++-- fs/f2fs/gc.c | 37 ++- fs/f2fs/inline.c | 1 + fs/f2fs/inode.c | 26 +- fs/f2fs/namei.c | 101 +++++-- fs/f2fs/node.c | 410 +++++++++++++------------ fs/f2fs/node.h | 16 +- fs/f2fs/recovery.c | 8 +- fs/f2fs/segment.c | 512 ++++++++++++++++++++++++-------- fs/f2fs/segment.h | 39 ++- fs/f2fs/shrinker.c | 2 +- fs/f2fs/super.c | 219 ++++++++++++-- fs/f2fs/sysfs.c | 53 +++- fs/f2fs/xattr.c | 174 ++++++----- include/linux/f2fs_fs.h | 10 +- include/trace/events/f2fs.h | 116 +++++++- 23 files changed, 1655 insertions(+), 623 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 500c60403653..2baed1151eac 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,18 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs//min_hot_blocks +Date: March 2017 +Contact: "Jaegeuk Kim" +Description: + Controls the dirty page count condition for redefining hot data. + +What: /sys/fs/f2fs//min_ssr_sections +Date: October 2017 +Contact: "Chao Yu" +Description: + Controls the fee section threshold to trigger SSR allocation. + What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" @@ -96,6 +108,18 @@ Contact: "Jaegeuk Kim" Description: Controls the checkpoint timing. +What: /sys/fs/f2fs//idle_interval +Date: January 2016 +Contact: "Jaegeuk Kim" +Description: + Controls the idle timing. + +What: /sys/fs/f2fs//iostat_enable +Date: August 2017 +Contact: "Chao Yu" +Description: + Controls to enable/disable IO stat. + What: /sys/fs/f2fs//ra_nid_pages Date: October 2015 Contact: "Chao Yu" @@ -116,6 +140,12 @@ Contact: "Shuoran Liu" Description: Shows total written kbytes issued to disk. +What: /sys/fs/f2fs//feature +Date: July 2017 +Contact: "Jaegeuk Kim" +Description: + Shows all enabled features in current device. + What: /sys/fs/f2fs//inject_rate Date: May 2016 Contact: "Sheng Yong" @@ -132,7 +162,18 @@ What: /sys/fs/f2fs//reserved_blocks Date: June 2017 Contact: "Chao Yu" Description: - Controls current reserved blocks in system. + Controls target reserved blocks in system, the threshold + is soft, it could exceed current available user space. + +What: /sys/fs/f2fs//current_reserved_blocks +Date: October 2017 +Contact: "Yunlong Song" +Contact: "Chao Yu" +Description: + Shows current reserved blocks in system, it may be temporarily + smaller than target_reserved_blocks, but will gradually + increase to target_reserved_blocks when more free blocks are + freed by user later. What: /sys/fs/f2fs//gc_urgent Date: August 2017 diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 112f8e04c549..3f52efa0f94f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -253,6 +253,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + return __f2fs_set_acl(inode, type, acl, NULL); } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e86f67ac96c6..2eb778174a9b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { set_ckpt_flags(sbi, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; if (!end_io) f2fs_flush_merged_writes(sbi); } @@ -402,24 +401,23 @@ const struct address_space_operations f2fs_meta_aops = { #endif }; -static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) +static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e, *tmp; tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); -retry: + radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = tmp; - if (radix_tree_insert(&im->ino_root, ino, e)) { - spin_unlock(&im->ino_lock); - radix_tree_preload_end(); - goto retry; - } + if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) + f2fs_bug_on(sbi, 1); + memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; @@ -427,6 +425,10 @@ retry: if (type != ORPHAN_INO) im->ino_num++; } + + if (type == FLUSH_INO) + f2fs_set_bit(devidx, (char *)&e->dirty_device); + spin_unlock(&im->ino_lock); radix_tree_preload_end(); @@ -455,7 +457,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ - __add_ino_entry(sbi, ino, type); + __add_ino_entry(sbi, ino, 0, type); } void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -481,7 +483,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) struct ino_entry *e, *tmp; int i; - for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -495,6 +497,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all) } } +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + __add_ino_entry(sbi, ino, devidx, type); +} + +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type) +{ + struct inode_management *im = &sbi->im[type]; + struct ino_entry *e; + bool is_dirty = false; + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) + is_dirty = true; + spin_unlock(&im->ino_lock); + return is_dirty; +} + int acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -531,7 +554,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); update_inode_page(inode); } @@ -555,7 +578,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return err; } - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(sbi, ino, 0, ORPHAN_INO); inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -591,6 +614,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; unsigned int s_flags = sbi->sb->s_flags; int err = 0; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; @@ -603,8 +629,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -632,7 +659,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -987,7 +1015,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) update_inode_page(inode); iput(inode); } - }; + } return 0; } @@ -1147,6 +1175,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct super_block *sb = sbi->sb; struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; + int err; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { @@ -1240,6 +1269,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; + /* flush all device cache */ + err = f2fs_flush_device_cache(sbi); + if (err) + return err; + /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c8583d7a1845..cdccc429325b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -172,7 +172,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, { struct bio *bio; - bio = f2fs_bio_alloc(npages); + bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; @@ -417,8 +417,8 @@ next: bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - /* set submitted = 1 as a return value */ - fio->submitted = 1; + /* set submitted = true as a return value */ + fio->submitted = true; inc_page_count(sbi, WB_DATA_TYPE(bio_page)); @@ -472,7 +472,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, f2fs_wait_on_block_writeback(sbi, blkaddr); } - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) { if (ctx) fscrypt_release_ctx(ctx); @@ -832,6 +832,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + /* convert inline data for Direct I/O*/ + if (iocb->ki_flags & IOCB_DIRECT) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; @@ -844,15 +851,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; - if (iocb->ki_flags & IOCB_DIRECT) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; + if (iocb->ki_flags & IOCB_DIRECT) return f2fs_map_blocks(inode, &map, 1, __force_buffered_io(inode, WRITE) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); - } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) @@ -1332,7 +1335,7 @@ static int f2fs_read_data_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct inode *inode = file->f_mapping->host; + struct inode *inode = mapping->host; struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); @@ -1493,6 +1496,7 @@ static int __write_data_page(struct page *page, bool *submitted, int err = 0; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1564,8 +1568,11 @@ write: err = do_write_data_page(&fio); } } + + down_write(&F2FS_I(inode)->i_sem); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; + up_write(&F2FS_I(inode)->i_sem); done: if (err && err != -ENOENT) @@ -1945,6 +1952,12 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, } trace_f2fs_write_begin(inode, pos, len, flags); + if (f2fs_is_atomic_file(inode) && + !available_free_memory(sbi, INMEM_PAGES)) { + err = -ENOMEM; + goto fail; + } + /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1960,7 +1973,8 @@ repeat: * Do not use grab_cache_page_write_begin() to avoid deadlock due to * wait_for_stable_page. Will wait that below with our IO control. */ - page = grab_cache_page(mapping, index); + page = f2fs_pagecache_get_page(mapping, index, + FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); if (!page) { err = -ENOMEM; goto fail; @@ -2021,6 +2035,8 @@ repeat: fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages_all(sbi); return err; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 87f449845f5f..ecada8425268 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA); si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + + si->nquota_files = 0; + if (f2fs_sb_has_quota_ino(sbi->sb)) { + for (i = 0; i < MAXQUOTAS; i++) { + if (f2fs_qf_ino(sbi->sb, i)) + si->nquota_files++; + } + } si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); @@ -61,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) atomic_read(&SM_I(sbi)->fcc_info->issued_flush); si->nr_flushing = atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + si->flush_list_empty = + llist_empty(&SM_I(sbi)->fcc_info->issue_list); } if (SM_I(sbi) && SM_I(sbi)->dcc_info) { si->nr_discarded = @@ -96,9 +107,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; si->avail_nids = NM_I(sbi)->available_nids; - si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -231,14 +242,14 @@ get_cache: } /* free nids */ - si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + - NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= ORPHAN_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * sizeof(struct extent_tree); @@ -262,9 +273,10 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, - f2fs_readonly(si->sbi->sb) ? "RO": "RW"); + f2fs_readonly(si->sbi->sb) ? "RO": "RW", + f2fs_cp_error(si->sbi) ? "Error": "Good"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -349,10 +361,11 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, + si->flush_list_empty, si->nr_discarding, si->nr_discarded, si->nr_discard_cmd, si->undiscard_blks); seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " @@ -365,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); + seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - imeta: %4d\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4f2a8fedb313..1955707b138b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -10,10 +10,12 @@ */ #include #include +#include #include "f2fs.h" #include "node.h" #include "acl.h" #include "xattr.h" +#include static unsigned long dir_blocks(struct inode *inode) { @@ -847,6 +849,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct f2fs_dentry_block *dentry_blk = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; + loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); @@ -855,24 +858,32 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) if (f2fs_encrypted_inode(inode)) { err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) - return err; + goto out; err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) - return err; + goto out; } if (f2fs_has_inline_dentry(inode)) { err = f2fs_read_inline_dir(file, ctx, &fstr); - goto out; + goto out_free; } - /* readahead for multi pages of dir */ - if (npages - n > 1 && !ra_has_index(ra, n)) - page_cache_sync_readahead(inode->i_mapping, ra, file, n, + for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) { + + /* allow readdir() to be interrupted */ + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto out_free; + } + cond_resched(); + + /* readahead for multi pages of dir */ + if (npages - n > 1 && !ra_has_index(ra, n)) + page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); @@ -880,7 +891,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = 0; continue; } else { - goto out; + goto out_free; } } @@ -896,12 +907,13 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) break; } - ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } -out: +out_free: fscrypt_fname_free_buffer(&fstr); +out: + trace_f2fs_readdir(inode, start_pos, ctx->pos, err); return err < 0 ? err : 0; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c1a0aef8efc6..081ec493baae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -47,6 +47,8 @@ enum { FAULT_KMALLOC, FAULT_PAGE_ALLOC, + FAULT_PAGE_GET, + FAULT_ALLOC_BIO, FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -94,6 +96,7 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 #define F2FS_MOUNT_QUOTA 0x00400000 +#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -119,6 +122,8 @@ struct f2fs_mount_info { #define F2FS_FEATURE_EXTRA_ATTR 0x0008 #define F2FS_FEATURE_PRJQUOTA 0x0010 #define F2FS_FEATURE_INODE_CHKSUM 0x0020 +#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040 +#define F2FS_FEATURE_QUOTA_INO 0x0080 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -214,7 +219,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) -#define DISCARD_ISSUE_RATE 8 +#define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ @@ -225,7 +230,6 @@ struct cp_control { __u64 trim_start; __u64 trim_end; __u64 trim_minlen; - __u64 trimmed; }; /* @@ -244,12 +248,14 @@ enum { ORPHAN_INO, /* for orphan ino list */ APPEND_INO, /* for append ino list */ UPDATE_INO, /* for update ino list */ + FLUSH_INO, /* for multiple device flushing */ MAX_INO_ENTRY, /* max. list */ }; struct ino_entry { - struct list_head list; /* list head */ - nid_t ino; /* inode number */ + struct list_head list; /* list head */ + nid_t ino; /* inode number */ + unsigned int dirty_device; /* dirty device bitmap */ }; /* for the list of inodes to be GCed */ @@ -273,10 +279,6 @@ struct discard_entry { #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) -#define P_ACTIVE 0x01 -#define P_TRIM 0x02 -#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) - enum { D_PREP, D_SUBMIT, @@ -308,12 +310,32 @@ struct discard_cmd { int error; /* bio error */ }; +enum { + DPOLICY_BG, + DPOLICY_FORCE, + DPOLICY_FSTRIM, + DPOLICY_UMOUNT, + MAX_DPOLICY, +}; + +struct discard_policy { + int type; /* type of discard */ + unsigned int min_interval; /* used for candidates exist */ + unsigned int max_interval; /* used for candidates not exist */ + unsigned int max_requests; /* # of discards issued per round */ + unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ + bool io_aware; /* issue discard in idle time */ + bool sync; /* submit discard with REQ_SYNC flag */ + unsigned int granularity; /* discard granularity */ +}; + struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ + struct list_head fstrim_list; /* in-flight discard from fstrim */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; @@ -443,11 +465,14 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 +#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); -#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ - (CUR_ADDRS_PER_INODE(inode) - \ - DEF_INLINE_RESERVED_SIZE - \ - F2FS_INLINE_XATTR_ADDRS)) +static inline int get_inline_xattr_addrs(struct inode *inode); +#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode) +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + F2FS_INLINE_XATTR_ADDRS(inode) - \ + DEF_INLINE_RESERVED_SIZE)) /* for inline dir */ #define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ @@ -647,6 +672,7 @@ struct f2fs_inode_info { #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + struct list_head inmem_ilist; /* list for inmem inodes */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -657,6 +683,7 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ + int i_inline_xattr_size; /* inline xattr size */ }; static inline void get_extent_info(struct extent_info *ext, @@ -730,10 +757,13 @@ static inline void __try_update_largest_extent(struct inode *inode, } } -enum nid_list { - FREE_NID_LIST, - ALLOC_NID_LIST, - MAX_NID_LIST, +/* + * For free nid management + */ +enum nid_state { + FREE_NID, /* newly added to free nid list */ + PREALLOC_NID, /* it is preallocated */ + MAX_NID_STATE, }; struct f2fs_nm_info { @@ -756,8 +786,8 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ - unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + struct list_head free_nid_list; /* list for free nids excluding preallocated nids */ + unsigned int nid_cnt[MAX_NID_STATE]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; @@ -835,6 +865,7 @@ enum { struct flush_cmd { struct completion wait; struct llist_node llnode; + nid_t ino; int ret; }; @@ -853,6 +884,8 @@ struct f2fs_sm_info { struct dirty_seglist_info *dirty_info; /* dirty segment information */ struct curseg_info *curseg_array; /* active segment information */ + struct rw_semaphore curseg_lock; /* for preventing curseg change */ + block_t seg0_blkaddr; /* block address of 0'th segment */ block_t main_blkaddr; /* start block address of main area */ block_t ssa_blkaddr; /* start block address of SSA area */ @@ -874,6 +907,7 @@ struct f2fs_sm_info { unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ + unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -895,6 +929,7 @@ struct f2fs_sm_info { enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, + F2FS_DIRTY_QDATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -943,6 +978,18 @@ enum need_lock_type { LOCK_RETRY, }; +enum cp_reason_type { + CP_NO_NEEDED, + CP_NON_REGULAR, + CP_HARDLINK, + CP_SB_NEED_CP, + CP_WRONG_PINO, + CP_NO_SPC_ROLL, + CP_NODE_NEED_CP, + CP_FASTBOOT_MODE, + CP_SPEC_LOG_NUM, +}; + enum iostat_type { APP_DIRECT_IO, /* app direct IOs */ APP_BUFFERED_IO, /* app buffered IOs */ @@ -962,6 +1009,7 @@ enum iostat_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ + nid_t ino; /* inode number */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ @@ -1006,6 +1054,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + ATOMIC_FILE, /* for all atomic files */ NR_INODE_TYPE, }; @@ -1108,12 +1157,15 @@ struct f2fs_sb_info { loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ + int inline_xattr_size; /* inline xattr size */ + unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ block_t reserved_blocks; /* configurable reserved blocks */ + block_t current_reserved_blocks; /* current reserved blocks */ u32 s_next_generation; /* for NFS support */ @@ -1179,6 +1231,8 @@ struct f2fs_sb_info { struct list_head s_list; int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ + unsigned int dirty_device; /* for checkpoint data flush */ + spinlock_t dev_lock; /* protect dirty_device */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1242,8 +1296,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) { - struct timespec ts = {sbi->interval_time[type], 0}; - unsigned long interval = timespec_to_jiffies(&ts); + unsigned long interval = sbi->interval_time[type] * HZ; return time_after(jiffies, sbi->last_time[type] + interval); } @@ -1410,6 +1463,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type) +{ + if (type < F2FS_MAX_QUOTAS) + return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]); + return 0; +} + static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) { size_t crc_offset = le32_to_cpu(cp->checksum_offset); @@ -1588,7 +1648,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; @@ -1622,6 +1683,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); f2fs_i_blocks_write(inode, count, false, true); } @@ -1642,6 +1707,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode) atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1658,6 +1725,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); + if (IS_NOQUOTA(inode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1765,10 +1834,17 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + goto enospc; + } +#endif + spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count + sbi->reserved_blocks > + if (unlikely(valid_block_count + sbi->current_reserved_blocks > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; @@ -1811,6 +1887,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, sbi->total_valid_node_count--; sbi->total_valid_block_count--; + if (sbi->reserved_blocks && + sbi->current_reserved_blocks < sbi->reserved_blocks) + sbi->current_reserved_blocks++; spin_unlock(&sbi->stat_lock); @@ -1857,6 +1936,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); } +static inline struct page *f2fs_pagecache_get_page( + struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp_mask) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { + f2fs_show_injection_info(FAULT_PAGE_GET); + return NULL; + } +#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); +} + static inline void f2fs_copy_page(struct page *src, struct page *dst) { char *src_kaddr = kmap(src); @@ -1906,15 +1998,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } -static inline struct bio *f2fs_bio_alloc(int npages) +static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, + int npages, bool no_fail) { struct bio *bio; - /* No failure on bio allocation */ - bio = bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + if (no_fail) { + /* No failure on bio allocation */ + bio = bio_alloc(GFP_NOIO, npages); + if (!bio) + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); + return bio; + } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { + f2fs_show_injection_info(FAULT_ALLOC_BIO); + return NULL; + } +#endif + return bio_alloc(GFP_KERNEL, npages); } static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, @@ -2224,25 +2326,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; - return CUR_ADDRS_PER_INODE(inode); + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode); } -static inline void *inline_xattr_addr(struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - - F2FS_INLINE_XATTR_ADDRS]); + F2FS_INLINE_XATTR_ADDRS(inode)]); } static inline int inline_xattr_size(struct inode *inode) { - if (f2fs_has_inline_xattr(inode)) - return F2FS_INLINE_XATTR_ADDRS << 2; - else - return 0; + return get_inline_xattr_addrs(inode) * sizeof(__le32); } static inline int f2fs_has_inline_data(struct inode *inode) @@ -2323,9 +2420,10 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + bool ret; + if (dsync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; spin_lock(&sbi->inode_lock[DIRTY_META]); ret = list_empty(&F2FS_I(inode)->gdirty_list); @@ -2336,9 +2434,15 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) file_keep_isize(inode) || i_size_read(inode) & PAGE_MASK) return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); + + down_read(&F2FS_I(inode)->i_sem); + ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); + up_read(&F2FS_I(inode)->i_sem); + + return ret; } +#define sb_rdonly f2fs_readonly static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -2406,6 +2510,12 @@ static inline int get_extra_isize(struct inode *inode) return F2FS_I(inode)->i_extra_isize / sizeof(__le32); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb); +static inline int get_inline_xattr_addrs(struct inode *inode) +{ + return F2FS_I(inode)->i_inline_xattr_size; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -2534,7 +2644,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -2562,7 +2672,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int truncate_inode_blocks(struct inode *inode, pgoff_t from); -int truncate_xattr_node(struct inode *inode, struct page *page); +int truncate_xattr_node(struct inode *inode); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); @@ -2597,19 +2707,22 @@ void destroy_node_manager_caches(void); */ bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages_all(struct f2fs_sb_info *sbi); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); -int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int create_flush_cmd_control(struct f2fs_sb_info *sbi); +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void init_discard_policy(struct discard_policy *dpolicy, int discard_type, + unsigned int granularity); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2664,6 +2777,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); +bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, + unsigned int devidx, int type); int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); int acquire_orphan_inode(struct f2fs_sb_info *sbi); void release_orphan_inode(struct f2fs_sb_info *sbi); @@ -2751,14 +2868,16 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; + int ndirty_data, ndirty_qdata; int inmem_pages; - unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_flushing, nr_flushed, flush_list_empty; + int nr_discarding, nr_discarded; int nr_discard_cmd; unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; @@ -3066,6 +3185,16 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); } +static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); +} + +static inline int f2fs_sb_has_quota_ino(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a9e1655a6bf8..bfff53f658e1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -56,6 +56,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto err; + } + sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -117,6 +122,7 @@ out_sem: out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); +err: return block_page_mkwrite_return(err); } @@ -141,27 +147,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } -static inline bool need_do_checkpoint(struct inode *inode) +static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool need_cp = false; + enum cp_reason_type cp_reason = CP_NO_NEEDED; - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; + if (!S_ISREG(inode->i_mode)) + cp_reason = CP_NON_REGULAR; + else if (inode->i_nlink != 1) + cp_reason = CP_HARDLINK; else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) - need_cp = true; + cp_reason = CP_SB_NEED_CP; else if (file_wrong_pino(inode)) - need_cp = true; + cp_reason = CP_WRONG_PINO; else if (!space_for_roll_forward(sbi)) - need_cp = true; + cp_reason = CP_NO_SPC_ROLL; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; + cp_reason = CP_NODE_NEED_CP; else if (test_opt(sbi, FASTBOOT)) - need_cp = true; + cp_reason = CP_FASTBOOT_MODE; else if (sbi->active_logs == 2) - need_cp = true; + cp_reason = CP_SPEC_LOG_NUM; - return need_cp; + return cp_reason; } static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) @@ -196,7 +204,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; - bool need_cp = false; + enum cp_reason_type cp_reason = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, @@ -215,7 +223,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, clear_inode_flag(inode, FI_NEED_IPU); if (ret) { - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -246,10 +254,10 @@ go_write: * sudden-power-off. */ down_read(&F2FS_I(inode)->i_sem); - need_cp = need_do_checkpoint(inode); + cp_reason = need_do_checkpoint(inode); up_read(&F2FS_I(inode)->i_sem); - if (need_cp) { + if (cp_reason) { /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); @@ -297,19 +305,24 @@ sync_nodes: remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(inode, FI_UPDATE_WRITE); if (!atomic) - ret = f2fs_issue_flush(sbi); + ret = f2fs_issue_flush(sbi, inode->i_ino); + if (!ret) { + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, FLUSH_INO); + } f2fs_update_time(sbi, REQ_TIME); out: - trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file))))) + return -EIO; return f2fs_do_sync_file(file, start, end, datasync, false); } @@ -446,6 +459,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -632,6 +648,9 @@ int f2fs_truncate(struct inode *inode) { int err; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return 0; @@ -667,7 +686,8 @@ int f2fs_getattr(struct vfsmount *mnt, generic_fillattr(inode, stat); /* we need to show initial sectors used for inline_data/dentries */ - if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || + f2fs_has_inline_dentry(inode)) stat->blocks += (stat->size + 511) >> 9; return 0; @@ -709,6 +729,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) int err; bool size_changed = false; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + err = inode_change_ok(inode, attr); if (err) return err; @@ -761,6 +784,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } + down_write(&F2FS_I(inode)->i_sem); + F2FS_I(inode)->last_disk_size = i_size_read(inode); + up_write(&F2FS_I(inode)->i_sem); + size_changed = true; } @@ -834,7 +861,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE); if (err) { if (err == -ENOENT) { - pg_start++; + pg_start = get_next_page_offset(&dn, pg_start); continue; } return err; @@ -1149,11 +1176,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - goto out; + goto out_unlock; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1165,7 +1195,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) f2fs_i_size_write(inode, new_size); - +out_unlock: + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1348,6 +1379,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (ret) goto out; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->dio_rwsem[WRITE]); + truncate_pagecache(inode, offset); pg_start = offset >> PAGE_SHIFT; @@ -1375,6 +1409,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); + + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); out: up_write(&F2FS_I(inode)->i_mmap_sem); return ret; @@ -1424,8 +1460,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) - f2fs_i_size_write(inode, new_size); + if (new_size > i_size_read(inode)) { + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); + else + f2fs_i_size_write(inode, new_size); + } return err; } @@ -1436,6 +1476,9 @@ static long f2fs_fallocate(struct file *file, int mode, struct inode *inode = file_inode(file); long ret = 0; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + /* f2fs only support ->fallocate for regular file */ if (!S_ISREG(inode->i_mode)) return -EINVAL; @@ -1469,8 +1512,6 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); - if (mode & FALLOC_FL_KEEP_SIZE) - file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1864,6 +1905,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); @@ -1871,6 +1915,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { + if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb)) + return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } @@ -2226,9 +2272,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); + down_write(&F2FS_I(src)->dio_rwsem[WRITE]); if (src != dst) { - if (!inode_trylock(dst)) { - ret = -EBUSY; + ret = -EBUSY; + if (!inode_trylock(dst)) + goto out; + if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) { + inode_unlock(dst); goto out; } } @@ -2288,9 +2338,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } f2fs_unlock_op(sbi); out_unlock: - if (src != dst) + if (src != dst) { + up_write(&F2FS_I(dst)->dio_rwsem[WRITE]); inode_unlock(dst); + } out: + up_write(&F2FS_I(src)->dio_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2412,6 +2465,9 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) + return -EIO; + switch (cmd) { case F2FS_IOC_GETFLAGS: return f2fs_ioc_getflags(filp, arg); @@ -2465,6 +2521,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) + return -EIO; + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { @@ -2475,6 +2534,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) err = f2fs_preallocate_blocks(iocb, from); if (err) { + clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); return err; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index bd16e6631cf3..be9fd616736b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } -static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - unsigned int valid_blocks = - get_valid_blocks(sbi, segno, true); - - return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? - valid_blocks * 2 : valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_greedy_cost(sbi, segno); + return get_valid_blocks(sbi, segno, true); else return get_cb_cost(sbi, segno); } @@ -466,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi, struct seg_entry *sentry; int ret; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); sentry = get_seg_entry(sbi, segno); ret = f2fs_test_bit(offset, sentry->cur_valid_map); - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return ret; } @@ -608,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx, { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_READ, @@ -659,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx, allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, CURSEG_COLD_DATA, NULL, false); - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, - FGP_LOCK | FGP_CREAT, GFP_NOFS); + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; goto recover_block; @@ -738,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), + .ino = inode->i_ino, .type = DATA, .temp = COLD, .op = REQ_OP_WRITE, @@ -840,10 +832,17 @@ next_step: continue; } + if (!down_write_trylock( + &F2FS_I(inode)->dio_rwsem[WRITE])) { + iput(inode); + continue; + } + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, REQ_RAHEAD, true); + up_write(&F2FS_I(inode)->dio_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); continue; @@ -901,10 +900,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, struct sit_info *sit_i = SIT_I(sbi); int ret; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, NO_CHECK_TYPE, LFS); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); return ret; } @@ -952,8 +951,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* * this is to avoid deadlock: * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fbf22b0f667f..91d5d831be72 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -130,6 +130,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), + .ino = dn->inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 50c88e37ed66..9684d53563f1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -232,6 +232,23 @@ static int do_read_inode(struct inode *inode) fi->i_extra_isize = f2fs_has_extra_attr(inode) ? le16_to_cpu(ri->i_extra_isize) : 0; + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size); + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } else { + + /* + * Previous inline data or directory always reserved 200 bytes + * in inode layout, even if inline_xattr is disabled. In order + * to keep inline_dentry's structure for backward compatibility, + * we get the space back only from inline_data. + */ + fi->i_inline_xattr_size = 0; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -384,6 +401,10 @@ int update_inode(struct inode *inode, struct page *node_page) if (f2fs_has_extra_attr(inode)) { ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb)) + ri->i_inline_xattr_size = + cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size); + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, i_projid)) { @@ -480,6 +501,7 @@ void f2fs_evict_inode(struct inode *inode) remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + remove_ino_entry(sbi, inode->i_ino, FLUSH_INO); sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); @@ -519,8 +541,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + else + f2fs_inode_synced(inode); /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d92b8e9064cb..cf8f4370d256 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_t ino; struct inode *inode; bool nid_free = false; + int xattr_size = 0; int err; inode = new_inode(dir->i_sb); @@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(inode, FI_INLINE_DENTRY); + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) { + f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode)); + if (f2fs_has_inline_xattr(inode)) + xattr_size = sbi->inline_xattr_size; + /* Otherwise, will be 0 */ + } else if (f2fs_has_inline_xattr(inode) || + f2fs_has_inline_dentry(inode)) { + xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + } + F2FS_I(inode)->i_inline_xattr_size = xattr_size; + f2fs_init_extent_tree(inode, NULL); stat_inc_inline_xattr(inode); @@ -177,6 +190,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -221,6 +237,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir) && !fscrypt_has_permitted_context(dir, inode)) return -EPERM; @@ -331,12 +350,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; - nid_t ino; + struct dentry *new; + nid_t ino = -1; int err = 0; unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + trace_f2fs_lookup_start(dir, dentry, flags); + if (f2fs_encrypted_inode(dir)) { - int res = fscrypt_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); /* * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is @@ -346,18 +368,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (fscrypt_has_encryption_key(dir)) fscrypt_set_encrypted_dentry(dentry); fscrypt_set_d_op(dentry); - if (res && res != -ENOKEY) - return ERR_PTR(res); + if (err && err != -ENOKEY) + goto out; } - if (dentry->d_name.len > F2FS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > F2FS_NAME_LEN) { + err = -ENAMETOOLONG; + goto out; + } de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { - if (IS_ERR(page)) - return (struct dentry *)page; - return d_splice_alias(inode, dentry); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + goto out_splice; } ino = le32_to_cpu(de->ino); @@ -365,19 +391,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out; + } if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { err = __recover_dot_dentries(dir, root_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) - goto err_out; + goto out_iput; } if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && @@ -386,12 +414,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, "Inconsistent encryption contexts: %lu/%lu", dir->i_ino, inode->i_ino); err = -EPERM; - goto err_out; + goto out_iput; } - return d_splice_alias(inode, dentry); - -err_out: +out_splice: + new = d_splice_alias(inode, dentry); + if (IS_ERR(new)) + err = PTR_ERR(new); + trace_f2fs_lookup_end(dir, dentry, ino, err); + return new; +out_iput: iput(inode); +out: + trace_f2fs_lookup_end(dir, dentry, ino, err); return ERR_PTR(err); } @@ -405,7 +439,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); + if (err) + return err; + err = dquot_initialize(inode); if (err) return err; @@ -457,6 +497,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct fscrypt_symlink_data *sd = NULL; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { err = fscrypt_get_encryption_info(dir); if (err) @@ -563,6 +606,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -615,6 +661,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + err = dquot_initialize(dir); if (err) return err; @@ -709,6 +758,9 @@ out: static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + if (f2fs_encrypted_inode(dir)) { int err = fscrypt_get_encryption_info(dir); if (err) @@ -720,6 +772,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) { + if (unlikely(f2fs_cp_error(F2FS_I_SB(dir)))) + return -EIO; + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); } @@ -739,6 +794,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && @@ -764,6 +822,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + if (new_inode) { + err = dquot_initialize(new_inode); + if (err) + goto out; + } + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -932,6 +996,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + if ((f2fs_encrypted_inode(old_dir) && !fscrypt_has_encryption_key(old_dir)) || (f2fs_encrypted_inode(new_dir) && diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 32474db18ad9..964c99655942 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + mem_size = (nm_i->nid_cnt[FREE_NID] * sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { @@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) } else if (type == INO_ENTRIES) { int i; - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i < MAX_INO_ENTRY; i++) mem_size += sbi->im[i].ino_num * sizeof(struct ino_entry); mem_size >>= PAGE_SHIFT; @@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INMEM_PAGES) { + /* it allows 20% / total_ram for inmemory pages */ + mem_size = get_pages(sbi, F2FS_INMEM_PAGES); + res = mem_size < (val.totalram / 5); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -134,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } +static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +{ + struct nat_entry *new; + + if (no_fail) + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + else + new = kmem_cache_alloc(nat_entry_slab, + GFP_NOFS | __GFP_ZERO); + if (new) { + nat_set_nid(new, nid); + nat_reset_flag(new); + } + return new; +} + +static void __free_nat_entry(struct nat_entry *e) +{ + kmem_cache_free(nat_entry_slab, e); +} + +/* must be locked by nat_tree_lock */ +static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, + struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail) +{ + if (no_fail) + f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne); + else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne)) + return NULL; + + if (raw_ne) + node_info_from_raw_nat(&ne->ni, raw_ne); + list_add_tail(&ne->list, &nm_i->nat_entries); + nm_i->nat_cnt++; + return ne; +} + static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { return radix_tree_lookup(&nm_i->nat_root, n); @@ -150,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; - kmem_cache_free(nat_entry_slab, e); + __free_nat_entry(e); } static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, @@ -246,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, - bool no_fail) -{ - struct nat_entry *new; - - if (no_fail) { - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); - } else { - new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } - } - - memset(new, 0, sizeof(struct nat_entry)); - nat_set_nid(new, nid); - nat_reset_flag(new); - list_add_tail(&new->list, &nm_i->nat_entries); - nm_i->nat_cnt++; - return new; -} - +/* must be locked by nat_tree_lock */ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct nat_entry *e; + struct nat_entry *new, *e; + new = __alloc_nat_entry(nid, false); + if (!new) + return; + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (!e) { - e = grab_nat_entry(nm_i, nid, false); - if (e) - node_info_from_raw_nat(&e->ni, ne); - } else { + if (!e) + e = __init_nat_entry(nm_i, new, ne, false); + else f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); - } + up_write(&nm_i->nat_tree_lock); + if (e != new) + __free_nat_entry(new); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -296,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + struct nat_entry *new = __alloc_nat_entry(ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid, true); + e = __init_nat_entry(nm_i, new, NULL, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -312,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } + /* let's free early to reduce memory consumption */ + if (e != new) + __free_nat_entry(new); /* sanity check */ f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); @@ -327,10 +353,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); - - /* in order to reuse the nid */ - if (nm_i->next_scan_nid > ni->nid) - nm_i->next_scan_nid = ni->nid; } /* change address */ @@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) f2fs_put_page(page, 1); cache: /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); } /* @@ -962,7 +982,8 @@ fail: return err > 0 ? 0 : err; } -int truncate_xattr_node(struct inode *inode, struct page *page) +/* caller must lock inode page */ +int truncate_xattr_node(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; @@ -978,10 +999,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, page, npage, nid); - - if (page) - dn.inode_page_locked = true; + set_new_dnode(&dn, inode, NULL, npage, nid); truncate_node(&dn); return 0; } @@ -1000,7 +1018,7 @@ int remove_inode_page(struct inode *inode) if (err) return err; - err = truncate_xattr_node(inode, dn.inode_page); + err = truncate_xattr_node(inode); if (err) { f2fs_put_dnode(&dn); return err; @@ -1220,7 +1238,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + page = f2fs_pagecache_get_page(inode->i_mapping, 0, + FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; @@ -1244,37 +1263,6 @@ iput_out: iput(inode); } -void move_node_page(struct page *node_page, int gc_type) -{ - if (gc_type == FG_GC) { - struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - .for_reclaim = 0, - }; - - set_page_dirty(node_page); - f2fs_wait_on_page_writeback(node_page, NODE, true); - - f2fs_bug_on(sbi, PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) - goto out_page; - - if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) - unlock_page(node_page); - goto release_page; - } else { - /* set page dirty and write it */ - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } -out_page: - unlock_page(node_page); -release_page: - f2fs_put_page(node_page, 0); -} - static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; @@ -1344,6 +1332,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, + .ino = ino_of_node(page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), @@ -1416,6 +1405,37 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (__write_node_page(node_page, false, NULL, + &wbc, false, FS_GC_NODE_IO)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { @@ -1764,35 +1784,54 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static int __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool new) +static int __insert_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - if (new) { - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); - if (err) - return err; - } + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]++; - list_add_tail(&i->list, &nm_i->nid_list[list]); + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]++; + if (state == FREE_NID) + list_add_tail(&i->list, &nm_i->free_nid_list); return 0; } -static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list, bool reuse) +static void __remove_free_nid(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_state state) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, state != i->state); + nm_i->nid_cnt[state]--; + if (state == FREE_NID) + list_del(&i->list); + radix_tree_delete(&nm_i->free_nid_root, i->nid); +} + +static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, + enum nid_state org_state, enum nid_state dst_state) { struct f2fs_nm_info *nm_i = NM_I(sbi); - f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : - i->state != NID_ALLOC); - nm_i->nid_cnt[list]--; - list_del(&i->list); - if (!reuse) - radix_tree_delete(&nm_i->free_nid_root, i->nid); + f2fs_bug_on(sbi, org_state != i->state); + i->state = dst_state; + nm_i->nid_cnt[org_state]--; + nm_i->nid_cnt[dst_state]++; + + switch (dst_state) { + case PREALLOC_NID: + list_del(&i->list); + break; + case FREE_NID: + list_add_tail(&i->list, &nm_i->free_nid_list); + break; + default: + BUG_ON(1); + } } /* return if the nid is recognized as free */ @@ -1810,7 +1849,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; - i->state = NID_NEW; + i->state = FREE_NID; if (radix_tree_preload(GFP_NOFS)) goto err; @@ -1823,7 +1862,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - f2fs_create * - f2fs_new_inode * - alloc_nid - * - __insert_nid_to_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(PREALLOC_NID) * - f2fs_balance_fs_bg * - build_free_nids * - __build_free_nids @@ -1836,8 +1875,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) * - new_node_page * - set_node_addr * - alloc_nid_done - * - __remove_nid_from_list(ALLOC_NID_LIST) - * - __insert_nid_to_list(FREE_NID_LIST) + * - __remove_nid_from_list(PREALLOC_NID) + * - __insert_nid_to_list(FREE_NID) */ ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || @@ -1846,13 +1885,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) e = __lookup_free_nid_list(nm_i, nid); if (e) { - if (e->state == NID_NEW) + if (e->state == FREE_NID) ret = true; goto err_out; } } ret = true; - err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + err = __insert_free_nid(sbi, i, FREE_NID); err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); @@ -1870,8 +1909,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + if (i && i->state == FREE_NID) { + __remove_free_nid(sbi, i, FREE_NID); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1890,15 +1929,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) return; - if (set) + if (set) { + if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - else - __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - - if (set) nm_i->free_nid_count[nat_ofs]++; - else if (!build) - nm_i->free_nid_count[nat_ofs]--; + } else { + if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs])) + return; + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + if (!build) + nm_i->free_nid_count[nat_ofs]--; + } } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1933,12 +1975,32 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +static void scan_curseg_cache(struct f2fs_sb_info *sbi) { - struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; + int i; + + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); +} + +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int i, idx; + nid_t nid; down_read(&nm_i->nat_tree_lock); @@ -1948,40 +2010,27 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi) if (!nm_i->free_nid_count[i]) continue; for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { - nid_t nid; - - if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) - continue; + idx = find_next_bit_le(nm_i->free_nid_bitmap[i], + NAT_ENTRY_PER_BLOCK, idx); + if (idx >= NAT_ENTRY_PER_BLOCK) + break; nid = i * NAT_ENTRY_PER_BLOCK + idx; add_free_nid(sbi, nid, true); - if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS) goto out; } } out: - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; - nid_t nid; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); } static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; @@ -1989,7 +2038,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nid = 0; /* Enough entries */ - if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; if (!sync && !available_free_memory(sbi, FREE_NIDS)) @@ -1999,7 +2048,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); - if (nm_i->nid_cnt[FREE_NID_LIST]) + if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) return; } @@ -2027,18 +2076,8 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - down_read(&curseg->journal_rwsem); - for (i = 0; i < nats_in_cursum(journal); i++) { - block_t addr; + scan_curseg_cache(sbi); - addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); - nid = le32_to_cpu(nid_in_journal(journal, i)); - if (addr == NULL_ADDR) - add_free_nid(sbi, nid, true); - else - remove_free_nid(sbi, nid); - } - up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), @@ -2076,15 +2115,13 @@ retry: } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); - i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); + i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); - i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); nm_i->available_nids--; update_free_nid_bitmap(sbi, *nid, false, false); @@ -2110,7 +2147,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -2133,12 +2170,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) f2fs_bug_on(sbi, !i); if (!available_free_memory(sbi, FREE_NIDS)) { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + __remove_free_nid(sbi, i, PREALLOC_NID); need_free = true; } else { - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); - i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); + __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID); } nm_i->available_nids++; @@ -2157,20 +2192,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], - list) { + list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { if (nr_shrink <= 0 || - nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) + nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + __remove_free_nid(sbi, i, FREE_NID); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2196,8 +2230,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = inline_xattr_addr(ipage); - src_addr = inline_xattr_addr(page); + dst_addr = inline_xattr_addr(inode, ipage); + src_addr = inline_xattr_addr(inode, page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE, true); @@ -2286,6 +2320,12 @@ retry: dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); if (dst->i_inline & F2FS_EXTRA_ATTR) { dst->i_extra_isize = src->i_extra_isize; + + if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_inline_xattr_size)) + dst->i_inline_xattr_size = src->i_inline_xattr_size; + if (f2fs_sb_has_project_quota(sbi->sb) && F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) @@ -2357,8 +2397,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid, true); - node_info_from_raw_nat(&ne->ni, &raw_ne); + ne = __alloc_nat_entry(nid, true); + __init_nat_entry(nm_i, ne, &raw_ne, true); } /* @@ -2404,15 +2444,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; struct f2fs_nat_block *nat_blk = page_address(page); int valid = 0; - int i; + int i = 0; if (!enabled_nat_bits(sbi, NULL)) return; - for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { - if (start_nid == 0 && i == 0) - valid++; - if (nat_blk->entries[i].block_addr) + if (nat_index == 0) { + valid = 1; + i = 1; + } + for (; i < NAT_ENTRY_PER_BLOCK; i++) { + if (nat_blk->entries[i].block_addr != NULL_ADDR) valid++; } if (valid == 0) { @@ -2607,7 +2649,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) __set_bit_le(i, nm_i->nat_block_bitmap); nid = i * NAT_ENTRY_PER_BLOCK; - last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + last_nid = nid + NAT_ENTRY_PER_BLOCK; spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) @@ -2642,16 +2684,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - F2FS_RESERVED_NODE_NUM; - nm_i->nid_cnt[FREE_NID_LIST] = 0; - nm_i->nid_cnt[ALLOC_NID_LIST] = 0; + nm_i->nid_cnt[FREE_NID] = 0; + nm_i->nid_cnt[PREALLOC_NID] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); - INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); + INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); @@ -2743,16 +2784,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], - list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { + __remove_free_nid(sbi, i, FREE_NID); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); - f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); - f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]); + f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]); + f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list)); spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index bb53e9955ff2..0ee3e5ff49a3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -140,6 +140,7 @@ enum mem_type { DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ + INMEM_PAGES, /* indicates inmemory pages */ BASE_CHECK, /* check kernel status */ }; @@ -150,18 +151,10 @@ struct nat_entry_set { unsigned int entry_cnt; /* the # of nat entries in set */ }; -/* - * For free nid mangement - */ -enum nid_state { - NID_NEW, /* newly added to free nid list */ - NID_ALLOC /* it is allocated */ -}; - struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ - int state; /* in use or not: NID_NEW or NID_ALLOC */ + int state; /* in use or not: FREE_NID or PREALLOC_NID */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) @@ -170,12 +163,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); - if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + if (nm_i->nid_cnt[FREE_NID] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], - struct free_nid, list); + fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9626758bc762..92c57ace1939 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; +#ifdef CONFIG_QUOTA + int quota_enabled; +#endif if (s_flags & MS_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); @@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - f2fs_enable_quota_files(sbi); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -665,7 +668,8 @@ skip: out: #ifdef CONFIG_QUOTA /* Turn quotas off */ - f2fs_quota_off_umount(sbi->sb); + if (quota_enabled) + f2fs_quota_off_umount(sbi->sb); #endif sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f5c494389483..94939a5a96c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -181,11 +181,12 @@ bool need_SSR(struct f2fs_sb_info *sbi) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void register_inmem_page(struct inode *inode, struct page *page) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; @@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page) mutex_lock(&fi->inmem_lock); get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(&fi->inmem_ilist)) + list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); @@ -262,12 +267,41 @@ next: return err; } +void drop_inmem_pages_all(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[ATOMIC_FILE]; + struct inode *inode; + struct f2fs_inode_info *fi; +next: + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + return; + } + fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + if (inode) { + drop_inmem_pages(inode); + iput(inode); + } + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto next; +} + void drop_inmem_pages(struct inode *inode) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); @@ -313,6 +347,7 @@ static int __commit_inmem_pages(struct inode *inode, struct inmem_pages *cur, *tmp; struct f2fs_io_info fio = { .sbi = sbi, + .ino = inode->i_ino, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, @@ -398,6 +433,10 @@ int commit_inmem_pages(struct inode *inode) /* drop all uncommitted pages */ __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_COMMIT); @@ -472,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(0); + struct bio *bio = f2fs_bio_alloc(sbi, 0, true); int ret; bio->bi_rw = REQ_OP_WRITE; @@ -485,15 +524,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, return ret; } -static int submit_flush_wait(struct f2fs_sb_info *sbi) +static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { - int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); + int ret = 0; int i; - if (!sbi->s_ndevs || ret) - return ret; + if (!sbi->s_ndevs) + return __submit_flush_wait(sbi, sbi->sb->s_bdev); - for (i = 1; i < sbi->s_ndevs; i++) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (!is_dirty_device(sbi, ino, i, FLUSH_INO)) + continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; @@ -519,7 +560,9 @@ repeat: fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - ret = submit_flush_wait(sbi); + cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); + + ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, @@ -537,7 +580,7 @@ repeat: goto repeat; } -int f2fs_issue_flush(struct f2fs_sb_info *sbi) +int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; @@ -547,19 +590,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); atomic_inc(&fcc->issued_flush); return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1) { - ret = submit_flush_wait(sbi); + if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); atomic_inc(&fcc->issued_flush); return ret; } + cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); @@ -583,7 +627,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) } else { struct flush_cmd *tmp, *next; - ret = submit_flush_wait(sbi); + ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { @@ -653,6 +697,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } } +int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) +{ + int ret = 0, i; + + if (!sbi->s_ndevs) + return 0; + + for (i = 1; i < sbi->s_ndevs; i++) { + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) + continue; + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; + + spin_lock(&sbi->dev_lock); + f2fs_clear_bit(i, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + return ret; +} + static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { @@ -794,6 +860,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -875,7 +943,7 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (ret) return ret; } - bio = f2fs_bio_alloc(1); + bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -926,10 +994,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, - struct discard_cmd *dc) + struct discard_policy *dpolicy, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct bio *bio = NULL; + int flag = dpolicy->sync ? REQ_SYNC : 0; if (dc->state != D_PREP) return; @@ -948,8 +1020,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, if (bio) { bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); - list_move_tail(&dc->list, &dcc->wait_list); + submit_bio(flag, bio); + list_move_tail(&dc->list, wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); f2fs_update_iostat(sbi, FS_DISCARD, 1); @@ -966,7 +1038,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node *insert_parent) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct rb_node **p = &dcc->root.rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; @@ -1134,58 +1206,107 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + unsigned int start, unsigned int end) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + int issued; + +next: + issued = 0; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, start, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc && dc->lstart <= end) { + struct rb_node *node; + + if (dc->len < dpolicy->granularity) + goto skip; + + if (dc->state != D_PREP) { + list_move_tail(&dc->list, &dcc->fstrim_list); + goto skip; + } + + __submit_discard_cmd(sbi, dpolicy, dc); + + if (++issued >= dpolicy->max_requests) { + start = dc->lstart + dc->len; + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + schedule(); + + goto next; + } +skip: + node = rb_next(&dc->rb_node); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + + if (fatal_signal_pending(current)) + break; + } + + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0, issued = 0; - int i; + int i, iter = 0, issued = 0; bool io_interrupted = false; - mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, - !__check_rb_tree_consistence(sbi, &dcc->root)); - blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dpolicy->granularity) + break; pend_list = &dcc->pend_list[i]; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - /* Hurry up to finish fstrim */ - if (dcc->pend_list_tag[i] & P_TRIM) { - __submit_discard_cmd(sbi, dc); - issued++; - - if (fatal_signal_pending(current)) - break; - continue; - } - - if (!issue_cond) { - __submit_discard_cmd(sbi, dc); - issued++; - continue; - } - - if (is_idle(sbi)) { - __submit_discard_cmd(sbi, dc); - issued++; - } else { + if (dpolicy->io_aware && i < dpolicy->io_aware_gran && + !is_idle(sbi)) { io_interrupted = true; + goto skip; } - if (++iter >= DISCARD_ISSUE_RATE) - goto out; + __submit_discard_cmd(sbi, dpolicy, dc); + issued++; +skip: + if (++iter >= dpolicy->max_requests) + break; } - if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) - dcc->pend_list_tag[i] &= (~P_TRIM); + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (iter >= dpolicy->max_requests) + break; } -out: - blk_finish_plug(&plug); - mutex_unlock(&dcc->cmd_lock); if (!issued && io_interrupted) issued = -1; @@ -1193,12 +1314,13 @@ out: return issued; } -static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; + bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { @@ -1206,39 +1328,58 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi) list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); + dropped = true; } } mutex_unlock(&dcc->cmd_lock); + + return dropped; } -static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, +static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; - if (!dc->ref) + if (!dc->ref) { + if (!dc->error) + len = dc->len; __remove_discard_cmd(sbi, dc); + } mutex_unlock(&dcc->cmd_lock); + + return len; } -static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy, + block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->wait_list); + struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? + &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc, *tmp; bool need_wait; + unsigned int trimmed = 0; next: need_wait = false; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(dc, tmp, wait_list, list) { - if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + if (dc->lstart + dc->len <= start || end <= dc->lstart) + continue; + if (dc->len < dpolicy->granularity) + continue; + if (dc->state == D_DONE && !dc->ref) { wait_for_completion_io(&dc->wait); + if (!dc->error) + trimmed += dc->len; __remove_discard_cmd(sbi, dc); } else { dc->ref++; @@ -1249,9 +1390,17 @@ next: mutex_unlock(&dcc->cmd_lock); if (need_wait) { - __wait_one_discard_bio(sbi, dc); + trimmed += __wait_one_discard_bio(sbi, dc); goto next; } + + return trimmed; +} + +static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1289,23 +1438,19 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) -{ - __issue_discard_cmd(sbi, false); - __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, !umount); -} - -static void mark_discard_range_all(struct f2fs_sb_info *sbi) +/* This comes from f2fs_put_super */ +bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; + struct discard_policy dpolicy; + bool dropped; - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) - dcc->pend_list_tag[i] |= P_TRIM; - mutex_unlock(&dcc->cmd_lock); + init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + __issue_discard_cmd(sbi, &dpolicy); + dropped = __drop_discard_cmd(sbi); + __wait_all_discard_cmd(sbi, &dpolicy); + + return dropped; } static int issue_discard_thread(void *data) @@ -1313,12 +1458,16 @@ static int issue_discard_thread(void *data) struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; set_freezable(); do { + init_discard_policy(&dpolicy, DPOLICY_BG, + dcc->discard_granularity); + wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || dcc->discard_wake, @@ -1331,17 +1480,18 @@ static int issue_discard_thread(void *data) if (dcc->discard_wake) { dcc->discard_wake = 0; if (sbi->gc_thread && sbi->gc_thread->gc_urgent) - mark_discard_range_all(sbi); + init_discard_policy(&dpolicy, + DPOLICY_FORCE, 1); } sb_start_intwrite(sbi->sb); - issued = __issue_discard_cmd(sbi, true); + issued = __issue_discard_cmd(sbi, &dpolicy); if (issued) { - __wait_discard_cmd(sbi, true); - wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + __wait_all_discard_cmd(sbi, &dpolicy); + wait_ms = dpolicy.min_interval; } else { - wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + wait_ms = dpolicy.max_interval; } sb_end_intwrite(sbi->sb); @@ -1605,7 +1755,6 @@ find_next: f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); - cpc->trimmed += len; total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, @@ -1626,6 +1775,37 @@ skip: wake_up_discard_thread(sbi, false); } +void init_discard_policy(struct discard_policy *dpolicy, + int discard_type, unsigned int granularity) +{ + /* common policy */ + dpolicy->type = discard_type; + dpolicy->sync = true; + dpolicy->granularity = granularity; + + if (discard_type == DPOLICY_BG) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FORCE) { + dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; + dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = true; + } else if (discard_type == DPOLICY_FSTRIM) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } else if (discard_type == DPOLICY_UMOUNT) { + dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; + dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->io_aware = false; + } +} + static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; @@ -1643,12 +1823,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) { + for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); - if (i >= dcc->discard_granularity - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - } INIT_LIST_HEAD(&dcc->wait_list); + INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->issing_discard, 0); @@ -1796,16 +1974,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) get_sec_entry(sbi, segno)->valid_blocks += del; } -void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new) -{ - update_sit_entry(sbi, new, 1); - if (GET_SEGNO(sbi, old) != NULL_SEGNO) - update_sit_entry(sbi, old, -1); - - locate_dirty_segment(sbi, GET_SEGNO(sbi, old)); - locate_dirty_segment(sbi, GET_SEGNO(sbi, new)); -} - void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) { unsigned int segno = GET_SEGNO(sbi, addr); @@ -1816,14 +1984,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) return; /* add it into sit main buffer */ - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); update_sit_entry(sbi, addr, -1); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) @@ -1836,7 +2004,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return true; - mutex_lock(&sit_i->sentry_lock); + down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); @@ -1845,7 +2013,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; - mutex_unlock(&sit_i->sentry_lock); + up_read(&sit_i->sentry_lock); return is_cp; } @@ -1903,12 +2071,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *dst = page_address(page); - if (src) - memcpy(dst, src, PAGE_SIZE); - else - memset(dst, 0, PAGE_SIZE); + memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -2007,7 +2171,6 @@ find_other_zone: } secno = left_start; skip_left: - hint = secno; segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); @@ -2242,12 +2405,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; + down_write(&SIT_I(sbi)->sentry_lock); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); locate_dirty_segment(sbi, old_segno); } + + up_write(&SIT_I(sbi)->sentry_lock); } static const struct segment_allocation default_salloc_ops = { @@ -2259,14 +2426,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; bool has_candidate = false; - mutex_lock(&SIT_I(sbi)->sentry_lock); + down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } - mutex_unlock(&SIT_I(sbi)->sentry_lock); + up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; @@ -2276,14 +2443,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; - unsigned int start_segno, end_segno; + unsigned int start_segno, end_segno, cur_segno; + block_t start_block, end_block; struct cp_control cpc; + struct discard_policy dpolicy; + unsigned long long trimmed = 0; int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -2297,12 +2466,14 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ - for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { - cpc.trim_start = start_segno; + for (cur_segno = start_segno; cur_segno <= end_segno; + cur_segno = cpc.trim_end + 1) { + cpc.trim_start = cur_segno; if (sbi->discard_blks == 0) break; @@ -2310,7 +2481,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) cpc.trim_end = end_segno; else cpc.trim_end = min_t(unsigned int, - rounddown(start_segno + + rounddown(cur_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -2322,11 +2493,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } - /* It's time to issue all the filed discards */ - mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi, false); + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1); + + init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); + trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); out: - range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -2338,6 +2514,20 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } +#if 0 +int rw_hint_to_seg_type(enum rw_hint hint) +{ + switch (hint) { + case WRITE_LIFE_SHORT: + return CURSEG_HOT_DATA; + case WRITE_LIFE_EXTREME: + return CURSEG_COLD_DATA; + default: + return CURSEG_WARM_DATA; + } +} +#endif + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2372,6 +2562,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (is_inode_flag_set(inode, FI_HOT_DATA)) return CURSEG_HOT_DATA; + /* rw_hint_to_seg_type(inode->i_write_hint); */ return CURSEG_WARM_DATA; } else { if (IS_DNODE(fio->page)) @@ -2416,8 +2607,10 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); + down_read(&SM_I(sbi)->curseg_lock); + mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -2434,15 +2627,26 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + /* + * SIT information should be updated before segment allocation, + * since SSR needs latest valid block information. + */ + update_sit_entry(sbi, *new_blkaddr, 1); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + update_sit_entry(sbi, old_blkaddr, -1); + if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); + /* - * SIT information should be updated after segment allocation, - * since we need to keep dirty segments precisely under SSR. + * segment dirty status should be updated after segment allocation, + * so we just need to update status only one time after previous + * segment being closed. */ - refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); + locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); + locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); @@ -2462,6 +2666,29 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } mutex_unlock(&curseg->curseg_mutex); + + up_read(&SM_I(sbi)->curseg_lock); +} + +static void update_device_state(struct f2fs_io_info *fio) +{ + struct f2fs_sb_info *sbi = fio->sbi; + unsigned int devidx; + + if (!sbi->s_ndevs) + return; + + devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + + /* update device state for fsync */ + set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) @@ -2478,6 +2705,8 @@ reallocate: if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; + } else if (!err) { + update_device_state(fio); } } @@ -2538,12 +2767,26 @@ int rewrite_data_page(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); + if (!err) + update_device_state(fio); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); return err; } +static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + int i; + + for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { + if (CURSEG_I(sbi, i)->segno == segno) + break; + } + return i; +} + void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr) @@ -2559,6 +2802,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, se = get_seg_entry(sbi, segno); type = se->type; + down_write(&SM_I(sbi)->curseg_lock); + if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { @@ -2568,14 +2813,19 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, type = CURSEG_WARM_DATA; } } else { - if (!IS_CURSEG(sbi, segno)) + if (IS_CURSEG(sbi, segno)) { + /* se->type is volatile as SSR allocation */ + type = __f2fs_get_curseg(sbi, segno); + f2fs_bug_on(sbi, type == NO_CHECK_TYPE); + } else { type = CURSEG_WARM_DATA; + } } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; @@ -2607,8 +2857,9 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, curseg->next_blkoff = old_blkoff; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); + up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, @@ -3062,7 +3313,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool to_journal = true; struct seg_entry *se; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; @@ -3156,7 +3407,7 @@ out: cpc->trim_start = trim_start; } - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } @@ -3249,7 +3500,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; - mutex_init(&sit_i->sentry_lock); + init_rwsem(&sit_i->sentry_lock); return 0; } @@ -3490,7 +3741,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; - mutex_lock(&sit_i->sentry_lock); + down_write(&sit_i->sentry_lock); sit_i->min_mtime = LLONG_MAX; @@ -3507,7 +3758,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi); - mutex_unlock(&sit_i->sentry_lock); + up_write(&sit_i->sentry_lock); } int build_segment_manager(struct f2fs_sb_info *sbi) @@ -3540,11 +3791,14 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; + sm_info->min_ssr_sections = reserved_sections(sbi); sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); + init_rwsem(&sm_info->curseg_lock); + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ffa11274b0ce..5264b6ed120c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -231,7 +231,7 @@ struct sit_info { unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ - struct mutex sentry_lock; /* to protect SIT cache */ + struct rw_semaphore sentry_lock; /* to protect SIT cache */ struct seg_entry *sentries; /* SIT segment-level cache */ struct sec_entry *sec_entries; /* SIT section-level cache */ @@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } +static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi) +{ + unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) + + get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int segno, left_blocks; + int i; + + /* check current node segment */ + for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + segno = CURSEG_I(sbi, i)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + if (node_blocks > left_blocks) + return false; + } + + /* check current data segment */ + segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + left_blocks = sbi->blocks_per_seg - + get_seg_entry(sbi, segno)->ckpt_valid_blocks; + if (dent_blocks > left_blocks) + return false; + return true; +} + static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; + if (free_sections(sbi) + freed == reserved_sections(sbi) + needed && + has_curseg_enough_space(sbi)) + return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + needed); @@ -730,7 +760,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) > sbi->fggc_threshold) return true; return false; @@ -795,8 +825,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) goto wake_up; mutex_lock(&dcc->cmd_lock); - for (i = MAX_PLIST_NUM - 1; - i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (i + 1 < dcc->discard_granularity) + break; if (!list_empty(&dcc->pend_list[i])) { wakeup = true; break; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 5c60fc28ec75..0b5664a1a6cc 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS; return count > 0 ? count : 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 482bb0333806..76e2f1518224 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -44,6 +44,8 @@ static struct kmem_cache *f2fs_inode_cachep; char *fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -92,6 +94,7 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_noinline_xattr, + Opt_inline_xattr_size, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -141,6 +144,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_noinline_xattr, "noinline_xattr"}, + {Opt_inline_xattr_size, "inline_xattr_size=%u"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -209,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, "quota options when quota turned on"); return -EINVAL; } + if (f2fs_sb_has_quota_ino(sb)) { + f2fs_msg(sb, KERN_INFO, + "QUOTA feature is enabled, so ignore qf_name"); + return 0; + } + qname = match_strdup(args); if (!qname) { f2fs_msg(sb, KERN_ERR, @@ -287,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) return -1; } } + + if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_INFO, + "QUOTA feature is enabled, so ignore jquota_fmt"); + sbi->s_jquota_fmt = 0; + } + if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -1; + } return 0; } #endif @@ -383,6 +405,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_xattr: clear_opt(sbi, INLINE_XATTR); break; + case Opt_inline_xattr_size: + if (args->from && match_int(args, &arg)) + return -EINVAL; + set_opt(sbi, INLINE_XATTR_SIZE); + sbi->inline_xattr_size = arg; + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -604,6 +632,24 @@ static int parse_options(struct super_block *sb, char *options) F2FS_IO_SIZE_KB(sbi)); return -EINVAL; } + + if (test_opt(sbi, INLINE_XATTR_SIZE)) { + if (!test_opt(sbi, INLINE_XATTR)) { + f2fs_msg(sb, KERN_ERR, + "inline_xattr_size option should be " + "set with inline_xattr option"); + return -EINVAL; + } + if (!sbi->inline_xattr_size || + sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE - + DEF_INLINE_RESERVED_SIZE - + DEF_MIN_INLINE_SIZE) { + f2fs_msg(sb, KERN_ERR, + "inline xattr size is out of range"); + return -EINVAL; + } + } return 0; } @@ -618,13 +664,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); /* Initialize f2fs-specific inode info */ - fi->vfs_inode.i_version = 1; atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->inmem_ilist); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); @@ -673,7 +719,6 @@ static int f2fs_drop_inode(struct inode *inode) sb_end_intwrite(inode->i_sb); - fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } @@ -781,6 +826,7 @@ static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); int i; + bool dropped; f2fs_quota_off_umount(sb); @@ -801,9 +847,9 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi, true); + dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -859,6 +905,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err = 0; + if (unlikely(f2fs_cp_error(sbi))) + return 0; + trace_f2fs_sync_fs(sb, sync); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -958,7 +1007,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + sbi->current_reserved_blocks; avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -1047,6 +1096,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",inline_xattr"); else seq_puts(seq, ",noinline_xattr"); + if (test_opt(sbi, INLINE_XATTR_SIZE)) + seq_printf(seq, ",inline_xattr_size=%u", + sbi->inline_xattr_size); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -1109,6 +1161,7 @@ static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; + sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1137,6 +1190,9 @@ static void default_options(struct f2fs_sb_info *sbi) #endif } +#ifdef CONFIG_QUOTA +static int f2fs_enable_quotas(struct super_block *sb); +#endif static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1203,6 +1259,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; +#ifdef CONFIG_QUOTA if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) @@ -1210,9 +1267,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } else { /* dquot_resume needs RW */ sb->s_flags &= ~MS_RDONLY; - dquot_resume(sb, -1); + if (sb_any_quota_suspended(sb)) { + dquot_resume(sb, -1); + } else if (f2fs_sb_has_quota_ino(sb)) { + err = f2fs_enable_quotas(sb); + if (err) + goto restore_opts; + } } - +#endif /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1321,8 +1384,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); repeat: page = read_mapping_page(mapping, blkidx, NULL); - if (IS_ERR(page)) + if (IS_ERR(page)) { + if (PTR_ERR(page) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto repeat; + } return PTR_ERR(page); + } lock_page(page); @@ -1365,11 +1433,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, while (towrite > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, towrite); - +retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, &page, NULL); - if (unlikely(err)) + if (unlikely(err)) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } break; + } kaddr = kmap_atomic(page); memcpy(kaddr + offset, data, tocopy); @@ -1386,8 +1459,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return 0; - inode->i_version++; + return err; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; @@ -1409,19 +1481,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) sbi->s_jquota_fmt, type); } -void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly) { - int i, ret; + int enabled = 0; + int i, err; + + if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) { + err = f2fs_enable_quotas(sbi->sb); + if (err) { + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quota_ino: %d", err); + return 0; + } + return 1; + } for (i = 0; i < MAXQUOTAS; i++) { if (sbi->s_qf_names[i]) { - ret = f2fs_quota_on_mount(sbi, i); - if (ret < 0) - f2fs_msg(sbi->sb, KERN_ERR, - "Cannot turn on journaled " - "quota: error %d", ret); + err = f2fs_quota_on_mount(sbi, i); + if (!err) { + enabled = 1; + continue; + } + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on quotas: %d on %d", err, i); + } + } + return enabled; +} + +static int f2fs_quota_enable(struct super_block *sb, int type, int format_id, + unsigned int flags) +{ + struct inode *qf_inode; + unsigned long qf_inum; + int err; + + BUG_ON(!f2fs_sb_has_quota_ino(sb)); + + qf_inum = f2fs_qf_ino(sb, type); + if (!qf_inum) + return -EPERM; + + qf_inode = f2fs_iget(sb, qf_inum); + if (IS_ERR(qf_inode)) { + f2fs_msg(sb, KERN_ERR, + "Bad quota inode %u:%lu", type, qf_inum); + return PTR_ERR(qf_inode); + } + + /* Don't account quota for quota files to avoid recursion */ + qf_inode->i_flags |= S_NOQUOTA; + err = dquot_enable(qf_inode, type, format_id, flags); + iput(qf_inode); + return err; +} + +static int f2fs_enable_quotas(struct super_block *sb) +{ + int type, err = 0; + unsigned long qf_inum; + bool quota_mopt[MAXQUOTAS] = { + test_opt(F2FS_SB(sb), USRQUOTA), + test_opt(F2FS_SB(sb), GRPQUOTA), + test_opt(F2FS_SB(sb), PRJQUOTA), + }; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { + qf_inum = f2fs_qf_ino(sb, type); + if (qf_inum) { + err = f2fs_quota_enable(sb, type, QFMT_VFS_V1, + DQUOT_USAGE_ENABLED | + (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "fsck to fix.", type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; + } } } + return 0; } static int f2fs_quota_sync(struct super_block *sb, int type) @@ -1492,7 +1636,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); - if (err) + if (err || f2fs_sb_has_quota_ino(sb)) goto out_put; inode_lock(inode); @@ -1660,7 +1804,7 @@ static loff_t max_file_blocks(void) /* * note: previously, result is equal to (DEF_ADDRS_PER_INODE - - * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more * space in inode.i_addr, it will be more safe to reassign * result as zero. */ @@ -1969,6 +2113,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (j = HOT; j < NR_TEMP_TYPE; j++) mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); + + sbi->dirty_device = 0; + spin_lock_init(&sbi->dev_lock); } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -2323,7 +2470,10 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; - sb->s_qcop = &f2fs_quotactl_ops; + if (f2fs_sb_has_quota_ino(sb)) + sb->s_qcop = &dquot_quotactl_sysfile_ops; + else + sb->s_qcop = &f2fs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif @@ -2419,6 +2569,7 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->reserved_blocks = 0; + sbi->current_reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -2493,10 +2644,24 @@ try_onemore: if (err) goto free_root_inode; +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) { + err = f2fs_enable_quotas(sb); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", err); + goto free_sysfs; + } + } +#endif /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) - goto free_sysfs; + goto free_meta; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2530,7 +2695,7 @@ try_onemore: err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_sysfs; + goto free_meta; } } skip_recovery: @@ -2564,6 +2729,10 @@ skip_recovery: return 0; free_meta: +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) + f2fs_quota_off_umount(sbi->sb); +#endif f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by recover_orphan_inodes() @@ -2572,7 +2741,9 @@ free_meta: * falls into an infinite loop in sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); +#ifdef CONFIG_QUOTA free_sysfs: +#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e2c258f717cd..9835348b6e5d 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -30,7 +30,7 @@ enum { FAULT_INFO_RATE, /* struct f2fs_fault_info */ FAULT_INFO_TYPE, /* struct f2fs_fault_info */ #endif - RESERVED_BLOCKS, + RESERVED_BLOCKS, /* struct f2fs_sb_info */ }; struct f2fs_attr { @@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return NULL; } +static ssize_t dirty_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(dirty_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -100,10 +107,22 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_inode_chksum(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); + if (f2fs_sb_has_flexible_inline_xattr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "flexible_inline_xattr"); + if (f2fs_sb_has_quota_ino(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "quota_ino"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } +static ssize_t current_reserved_blocks_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -143,34 +162,22 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); - if ((unsigned long)sbi->total_valid_block_count + t > - (unsigned long)sbi->user_block_count) { + if (t > (unsigned long)sbi->user_block_count) { spin_unlock(&sbi->stat_lock); return -EINVAL; } *ui = t; + sbi->current_reserved_blocks = min(sbi->reserved_blocks, + sbi->user_block_count - valid_user_blocks(sbi)); spin_unlock(&sbi->stat_lock); return count; } if (!strcmp(a->attr.name, "discard_granularity")) { - struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - int i; - if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; if (t == *ui) return count; - - mutex_lock(&dcc->cmd_lock); - for (i = 0; i < MAX_PLIST_NUM; i++) { - if (i >= t - 1) - dcc->pend_list_tag[i] |= P_ACTIVE; - else - dcc->pend_list_tag[i] &= (~P_ACTIVE); - } - mutex_unlock(&dcc->cmd_lock); - *ui = t; return count; } @@ -222,6 +229,8 @@ enum feat_id { FEAT_EXTRA_ATTR, FEAT_PROJECT_QUOTA, FEAT_INODE_CHECKSUM, + FEAT_FLEXIBLE_INLINE_XATTR, + FEAT_QUOTA_INO, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -234,6 +243,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_EXTRA_ATTR: case FEAT_PROJECT_QUOTA: case FEAT_INODE_CHECKSUM: + case FEAT_FLEXIBLE_INLINE_XATTR: + case FEAT_QUOTA_INO: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -279,6 +290,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -291,8 +303,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif +F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); +F2FS_GENERAL_RO_ATTR(current_reserved_blocks); #ifdef CONFIG_F2FS_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); @@ -304,6 +318,8 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); +F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); +F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -321,6 +337,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), ATTR_LIST(min_hot_blocks), + ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -333,9 +350,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif + ATTR_LIST(dirty_segments), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), ATTR_LIST(reserved_blocks), + ATTR_LIST(current_reserved_blocks), NULL, }; @@ -350,6 +369,8 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(extra_attr), ATTR_LIST(project_quota), ATTR_LIST(inode_checksum), + ATTR_LIST(flexible_inline_xattr), + ATTR_LIST(quota_ino), NULL, }; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index ab658419552b..7acf56ebda65 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,12 +264,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, - void **last_addr, int index, - size_t len, const char *name) +static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, + void *base_addr, void **last_addr, int index, + size_t len, const char *name) { struct f2fs_xattr_entry *entry; - unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + unsigned int inline_size = inline_xattr_size(inode); list_for_each_xattr(entry, base_addr) { if ((void *)entry + sizeof(__u32) > base_addr + inline_size || @@ -288,12 +288,54 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, return entry; } +static int read_inline_xattr(struct inode *inode, struct page *ipage, + void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int inline_size = inline_xattr_size(inode); + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(inode, ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) + return PTR_ERR(page); + + inline_addr = inline_xattr_addr(inode, page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + return 0; +} + +static int read_xattr_block(struct inode *inode, void *txattr_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int inline_size = inline_xattr_size(inode); + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) + return PTR_ERR(xpage); + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); + f2fs_put_page(xpage, 1); + + return 0; +} + static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; @@ -310,23 +352,11 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto out; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto out; - *xe = __find_inline_xattr(txattr_addr, &last_addr, + *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); if (*xe) goto check; @@ -334,19 +364,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto out; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } if (last_addr) @@ -371,7 +391,6 @@ out: static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = VALID_XATTR_BLOCK_SIZE; @@ -386,38 +405,16 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; - } - inline_addr = inline_xattr_addr(page); - } - memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + err = read_inline_xattr(inode, ipage, txattr_addr); + if (err) + goto fail; } /* read from xattr node block */ if (xnid) { - struct page *xpage; - void *xattr_addr; - - /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, xnid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + err = read_xattr_block(inode, txattr_addr); + if (err) goto fail; - } - - xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, size); - f2fs_put_page(xpage, 1); } header = XATTR_HDR(txattr_addr); @@ -439,10 +436,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); + struct page *in_page = NULL; void *xattr_addr; + void *inline_addr = NULL; struct page *xpage; nid_t new_nid = 0; - int err; + int err = 0; if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) @@ -450,30 +449,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - struct page *page = NULL; - void *inline_addr; - if (ipage) { - inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true); - set_page_dirty(ipage); + inline_addr = inline_xattr_addr(inode, ipage); } else { - page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { + in_page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(in_page)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(page); + return PTR_ERR(in_page); } - inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE, true); + inline_addr = inline_xattr_addr(inode, in_page); } - memcpy(inline_addr, txattr_addr, inline_size); - f2fs_put_page(page, 1); + f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + NODE, true); /* no need to use xattr node block */ if (hsize <= inline_size) { - err = truncate_xattr_node(inode, ipage); + err = truncate_xattr_node(inode); alloc_nid_failed(sbi, new_nid); - return err; + if (err) { + f2fs_put_page(in_page, 1); + return err; + } + memcpy(inline_addr, txattr_addr, inline_size); + set_page_dirty(ipage ? ipage : in_page); + goto in_page_out; } } @@ -482,7 +481,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE, true); @@ -492,17 +491,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); - return PTR_ERR(xpage); + goto in_page_out; } alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); + + if (inline_size) + memcpy(inline_addr, txattr_addr, inline_size); memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); + + if (inline_size) + set_page_dirty(ipage ? ipage : in_page); set_page_dirty(xpage); - f2fs_put_page(xpage, 1); - return 0; + f2fs_put_page(xpage, 1); +in_page_out: + f2fs_put_page(in_page, 1); + return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, @@ -721,6 +727,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; + err = dquot_initialize(inode); + if (err) + return err; + /* this case is only from init_inode_metadata */ if (ipage) return __f2fs_setxattr(inode, index, name, value, diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c2a975e4a711..fef1caeddf54 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,8 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_MAX_QUOTAS 3 + #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ #define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ @@ -108,7 +110,8 @@ struct f2fs_super_block { __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ struct f2fs_device devs[MAX_DEVICES]; /* device list */ - __u8 reserved[327]; /* valid reserved region */ + __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ + __u8 reserved[315]; /* valid reserved region */ } __packed; /* @@ -184,7 +187,8 @@ struct f2fs_extent { } __packed; #define F2FS_NAME_LEN 255 -#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ +/* 200 bytes for inline xattrs by default */ +#define DEFAULT_INLINE_XATTR_ADDRS 50 #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ get_extra_isize(inode)) @@ -238,7 +242,7 @@ struct f2fs_inode { union { struct { __le16 i_extra_isize; /* extra inode attribute size */ - __le16 i_padding; /* padding */ + __le16 i_inline_xattr_size; /* inline xattr size, unit: 4 bytes */ __le32 i_projid; /* project id */ __le32 i_inode_checksum;/* inode meta checksum */ __le32 i_extra_end[0]; /* for attribute size calculation */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7063bbcca03b..589df6f73789 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -128,6 +128,18 @@ TRACE_DEFINE_ENUM(CP_TRIMMED); { CP_DISCARD, "Discard" }, \ { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) +#define show_fsync_cpreason(type) \ + __print_symbolic(type, \ + { CP_NO_NEEDED, "no needed" }, \ + { CP_NON_REGULAR, "non regular" }, \ + { CP_HARDLINK, "hardlink" }, \ + { CP_SB_NEED_CP, "sb needs cp" }, \ + { CP_WRONG_PINO, "wrong pino" }, \ + { CP_NO_SPC_ROLL, "no space roll forward" }, \ + { CP_NODE_NEED_CP, "node needs cp" }, \ + { CP_FASTBOOT_MODE, "fastboot mode" }, \ + { CP_SPEC_LOG_NUM, "log type is 2" }) + struct victim_sel_policy; struct f2fs_map_blocks; @@ -202,14 +214,14 @@ DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter, TRACE_EVENT(f2fs_sync_file_exit, - TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret), + TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret), - TP_ARGS(inode, need_cp, datasync, ret), + TP_ARGS(inode, cp_reason, datasync, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(int, need_cp) + __field(int, cp_reason) __field(int, datasync) __field(int, ret) ), @@ -217,15 +229,15 @@ TRACE_EVENT(f2fs_sync_file_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->need_cp = need_cp; + __entry->cp_reason = cp_reason; __entry->datasync = datasync; __entry->ret = ret; ), - TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, " + TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, " "datasync = %d, ret = %d", show_dev_ino(__entry), - __entry->need_cp ? "needed" : "not needed", + show_fsync_cpreason(__entry->cp_reason), __entry->datasync, __entry->ret) ); @@ -716,6 +728,91 @@ TRACE_EVENT(f2fs_get_victim, __entry->free) ); +TRACE_EVENT(f2fs_lookup_start, + + TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags), + + TP_ARGS(dir, dentry, flags), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const char *, name) + __field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->flags = flags; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", + show_dev_ino(__entry), + __entry->name, + __entry->flags) +); + +TRACE_EVENT(f2fs_lookup_end, + + TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino, + int err), + + TP_ARGS(dir, dentry, ino, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(const char *, name) + __field(nid_t, cino) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->name = dentry->d_name.name; + __entry->cino = ino; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", + show_dev_ino(__entry), + __entry->name, + __entry->cino, + __entry->err) +); + +TRACE_EVENT(f2fs_readdir, + + TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err), + + TP_ARGS(dir, start_pos, end_pos, err), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, start) + __field(loff_t, end) + __field(int, err) + ), + + TP_fast_assign( + __entry->dev = dir->i_sb->s_dev; + __entry->ino = dir->i_ino; + __entry->start = start_pos; + __entry->end = end_pos; + __entry->err = err; + ), + + TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d", + show_dev_ino(__entry), + __entry->start, + __entry->end, + __entry->err) +); + TRACE_EVENT(f2fs_fallocate, TP_PROTO(struct inode *inode, int mode, @@ -1274,6 +1371,13 @@ DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, TP_ARGS(dev, blkstart, blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_remove_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), -- cgit v1.2.3 From 67a8ab4adc2d30e3f3442a0659b887b4da83f7c2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Oct 2017 21:23:19 +0000 Subject: UPSTREAM: arm64: vdso: fix clock_getres for 4GiB-aligned res (cherry pick from commit c80ed088a519da53f27b798a69748eaabc66aadf) The vdso tries to check for a NULL res pointer in __kernel_clock_getres, but only checks the lower 32 bits as is uses CBZ on the W register the res pointer is held in. Thus, if the res pointer happened to be aligned to a 4GiB boundary, we'd spuriously skip storing the timespec to it, while returning a zero error code to the caller. Prevent this by checking the whole pointer, using CBZ on the X register the res pointer is held in. Fixes: 9031fefde6f2ac1d ("arm64: VDSO support") Signed-off-by: Mark Rutland Reported-by: Andrew Pinski Reported-by: Mark Salyzyn Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Will Deacon Bug: 20045882 Bug: 63737556 Change-Id: Iab5449d8515f9d655e792e3d7ce43a8f016fa2a0 --- arch/arm64/kernel/vdso/gettimeofday.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index e00b4671bd7c..c97ce91cf023 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -310,7 +310,7 @@ ENTRY(__kernel_clock_getres) b.ne 4f ldr x2, 6f 2: - cbz w1, 3f + cbz x1, 3f stp xzr, x2, [x1] 3: /* res == NULL. */ -- cgit v1.2.3 From 28850c79d071b4eef0001bb76af2cfb29402f4ac Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 8 Jun 2017 16:44:21 -0700 Subject: BACKPORT: time: Fix CLOCK_MONOTONIC_RAW sub-nanosecond accounting (cherry pick from commit 3d88d56c5873f6eebe23e05c3da701960146b801) Due to how the MONOTONIC_RAW accumulation logic was handled, there is the potential for a 1ns discontinuity when we do accumulations. This small discontinuity has for the most part gone un-noticed, but since ARM64 enabled CLOCK_MONOTONIC_RAW in their vDSO clock_gettime implementation, we've seen failures with the inconsistency-check test in kselftest. This patch addresses the issue by using the same sub-ns accumulation handling that CLOCK_MONOTONIC uses, which avoids the issue for in-kernel users. Since the ARM64 vDSO implementation has its own clock_gettime calculation logic, this patch reduces the frequency of errors, but failures are still seen. The ARM64 vDSO will need to be updated to include the sub-nanosecond xtime_nsec values in its calculation for this issue to be completely fixed. Signed-off-by: John Stultz Tested-by: Daniel Mentz Cc: Prarit Bhargava Cc: Kevin Brodsky Cc: Richard Cochran Cc: Stephen Boyd Cc: Will Deacon Cc: "stable #4 . 8+" Cc: Miroslav Lichvar Link: http://lkml.kernel.org/r/1496965462-20003-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Bug: 20045882 Bug: 63737556 Change-Id: I6c55dd7685f6bd212c6af9d09c527528e1dd5fa1 --- include/linux/timekeeper_internal.h | 4 ++-- kernel/time/timekeeping.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index f0f1793cfa49..115216ec7cfe 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -56,7 +56,7 @@ struct tk_read_base { * interval. * @xtime_remainder: Shifted nano seconds left over when rounding * @cycle_interval - * @raw_interval: Raw nano seconds accumulated per NTP interval. + * @raw_interval: Shifted raw nano seconds accumulated per NTP interval. * @ntp_error: Difference between accumulated time and NTP time in ntp * shifted nano seconds. * @ntp_error_shift: Shift conversion between clock shifted nano seconds and @@ -97,7 +97,7 @@ struct timekeeper { cycle_t cycle_interval; u64 xtime_interval; s64 xtime_remainder; - u32 raw_interval; + u64 raw_interval; /* The ntp_tick_length() value currently being used. * This cached copy ensures we consistently apply the tick * length for an entire tick, as ntp_tick_length may change diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 738f3467d169..5a514e6002d2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -277,8 +277,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Go back from cycles -> shifted ns */ tk->xtime_interval = (u64) interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { @@ -1796,7 +1795,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, unsigned int *clock_set) { cycle_t interval = tk->cycle_interval << shift; - u64 raw_nsecs; + u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) @@ -1811,14 +1810,15 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = (u64)tk->raw_interval << shift; - raw_nsecs += tk->raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - tk->raw_time.tv_sec += raw_secs; + tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_time.tv_sec++; } - tk->raw_time.tv_nsec = raw_nsecs; + tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; + tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; -- cgit v1.2.3 From 1d35c0438678c7ad4c367135082685d5754eed20 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 22 May 2017 17:20:20 -0700 Subject: BACKPORT: time: Clean up CLOCK_MONOTONIC_RAW time handling (cherry pick from commit fc6eead7c1e2e5376c25d2795d4539fdacbc0648) Now that we fixed the sub-ns handling for CLOCK_MONOTONIC_RAW, remove the duplicitive tk->raw_time.tv_nsec, which can be stored in tk->tkr_raw.xtime_nsec (similarly to how its handled for monotonic time). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Miroslav Lichvar Cc: Richard Cochran Cc: Prarit Bhargava Cc: Stephen Boyd Cc: Kevin Brodsky Cc: Will Deacon Cc: Daniel Mentz Tested-by: Daniel Mentz Signed-off-by: John Stultz Bug: 20045882 Bug: 63737556 Change-Id: I243827d21b08703a09d2d2fe738a9258be224582 --- arch/arm64/kernel/vdso.c | 4 ++-- include/linux/timekeeper_internal.h | 4 ++-- kernel/time/timekeeping.c | 45 ++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index b7730ba782dd..7e9dd94452bb 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -218,8 +218,8 @@ void update_vsyscall(struct timekeeper *tk) if (!use_syscall) { /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; - vdso_data->raw_time_sec = tk->raw_time.tv_sec; - vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; + vdso_data->raw_time_sec = tk->raw_sec; + vdso_data->raw_time_nsec = tk->tkr_raw.xtime_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; /* tkr_raw.xtime_nsec == 0 */ diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 115216ec7cfe..3a5af09af18b 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,7 +50,7 @@ struct tk_read_base { * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second - * @raw_time: Monotonic raw base time in timespec64 format + * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP * interval. @@ -91,7 +91,7 @@ struct timekeeper { s32 tai_offset; unsigned int clock_was_set_seq; ktime_t next_leap_ktime; - struct timespec64 raw_time; + u64 raw_sec; /* The following members are for timekeeping internal use */ cycle_t cycle_interval; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5a514e6002d2..fc86fdcce932 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } } static inline struct timespec64 tk_xtime(struct timekeeper *tk) @@ -282,12 +286,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; - if (shift_change < 0) + if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; - else + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } } - tk->tkr_raw.xtime_nsec = 0; tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; @@ -616,9 +622,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); - /* Update the monotonic raw base */ - tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); - /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take @@ -628,6 +631,11 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + seconds = tk->raw_sec; + nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); + tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); } /* must hold timekeeper_lock */ @@ -669,7 +677,6 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) static void timekeeping_forward_now(struct timekeeper *tk) { cycle_t cycle_now, delta; - s64 nsec; cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); @@ -681,10 +688,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); - timespec64_add_ns(&tk->raw_time, nsec); + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); } /** @@ -1178,19 +1188,18 @@ int timekeeping_notify(struct clocksource *clock) void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; unsigned long seq; s64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); - ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(&ts64, nsecs); - *ts = ts64; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic64); @@ -1314,8 +1323,7 @@ void __init timekeeping_init(void) tk_setup_internals(tk, clock); tk_set_xtime(tk, &now); - tk->raw_time.tv_sec = 0; - tk->raw_time.tv_nsec = 0; + tk->raw_sec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -1810,15 +1818,12 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { tk->tkr_raw.xtime_nsec -= snsec_per_sec; - tk->raw_time.tv_sec++; + tk->raw_sec++; } - tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift; - tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; -- cgit v1.2.3 From 7ddbe701076d4ff2ba15c1a111bd41681594819e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Nov 2017 12:29:34 +0100 Subject: s390: fix transactional execution control register handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a1c5befc1c24eb9c1ee83f711e0f21ee79cbb556 upstream. Dan Horák reported the following crash related to transactional execution: User process fault: interruption code 0013 ilc:3 in libpthread-2.26.so[3ff93c00000+1b000] CPU: 2 PID: 1 Comm: /init Not tainted 4.13.4-300.fc27.s390x #1 Hardware name: IBM 2827 H43 400 (z/VM 6.4.0) task: 00000000fafc8000 task.stack: 00000000fafc4000 User PSW : 0705200180000000 000003ff93c14e70 R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:1 AS:0 CC:2 PM:0 RI:0 EA:3 User GPRS: 0000000000000077 000003ff00000000 000003ff93144d48 000003ff93144d5e 0000000000000000 0000000000000002 0000000000000000 000003ff00000000 0000000000000000 0000000000000418 0000000000000000 000003ffcc9fe770 000003ff93d28f50 000003ff9310acf0 000003ff92b0319a 000003ffcc9fe6d0 User Code: 000003ff93c14e62: 60e0b030 std %f14,48(%r11) 000003ff93c14e66: 60f0b038 std %f15,56(%r11) #000003ff93c14e6a: e5600000ff0e tbegin 0,65294 >000003ff93c14e70: a7740006 brc 7,3ff93c14e7c 000003ff93c14e74: a7080000 lhi %r0,0 000003ff93c14e78: a7f40023 brc 15,3ff93c14ebe 000003ff93c14e7c: b2220000 ipm %r0 000003ff93c14e80: 8800001c srl %r0,28 There are several bugs with control register handling with respect to transactional execution: - on task switch update_per_regs() is only called if the next task has an mm (is not a kernel thread). This however is incorrect. This breaks e.g. for user mode helper handling, where the kernel creates a kernel thread and then execve's a user space program. Control register contents related to transactional execution won't be updated on execve. If the previous task ran with transactional execution disabled then the new task will also run with transactional execution disabled, which is incorrect. Therefore call update_per_regs() unconditionally within switch_to(). - on startup the transactional execution facility is not enabled for the idle thread. This is not really a bug, but an inconsistency to other facilities. Therefore enable the facility if it is available. - on fork the new thread's per_flags field is not cleared. This means that a child process inherits the PER_FLAG_NO_TE flag. This flag can be set with a ptrace request to disable transactional execution for the current process. It should not be inherited by new child processes in order to be consistent with the handling of all other PER related debugging options. Therefore clear the per_flags field in copy_thread_tls(). Reported-and-tested-by: Dan Horák Fixes: d35339a42dd1 ("s390: add support for transactional memory") Cc: Martin Schwidefsky Reviewed-by: Christian Borntraeger Reviewed-by: Hendrik Brueckner Signed-off-by: Heiko Carstens Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/switch_to.h | 2 +- arch/s390/kernel/early.c | 4 +++- arch/s390/kernel/process.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index 12d45f0cfdd9..dde6b52359c5 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -34,8 +34,8 @@ static inline void restore_access_regs(unsigned int *acrs) save_access_regs(&prev->thread.acrs[0]); \ save_ri_cb(prev->thread.ri_cb); \ } \ + update_cr_regs(next); \ if (next->mm) { \ - update_cr_regs(next); \ set_cpu_flag(CIF_FPU); \ restore_access_regs(&next->thread.acrs[0]); \ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 3c31609df959..ee7b8e7ca4f8 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -325,8 +325,10 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_IDTE; if (test_facility(40)) S390_lowcore.machine_flags |= MACHINE_FLAG_LPP; - if (test_facility(50) && test_facility(73)) + if (test_facility(50) && test_facility(73)) { S390_lowcore.machine_flags |= MACHINE_FLAG_TE; + __ctl_set_bit(0, 55); + } if (test_facility(51)) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; if (test_facility(129)) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 114ee8b96f17..efa035a31b98 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -137,6 +137,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, memset(&p->thread.per_user, 0, sizeof(p->thread.per_user)); memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); clear_tsk_thread_flag(p, TIF_SINGLE_STEP); + p->thread.per_flags = 0; /* Initialize per thread user and system timer values */ ti = task_thread_info(p); ti->user_timer = 0; -- cgit v1.2.3 From 04bc7a273264ce3a528b97af72a6473bc4a13fd7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 11 Sep 2017 11:24:22 +0200 Subject: s390/runtime instrumention: fix possible memory corruption commit d6e646ad7cfa7034d280459b2b2546288f247144 upstream. For PREEMPT enabled kernels the runtime instrumentation (RI) code contains a possible use-after-free bug. If a task that makes use of RI exits, it will execute do_exit() while still enabled for preemption. That function will call exit_thread_runtime_instr() via exit_thread(). If exit_thread_runtime_instr() gets preempted after the RI control block of the task has been freed but before the pointer to it is set to NULL, then save_ri_cb(), called from switch_to(), will write to already freed memory. Avoid this and simply disable preemption while freeing the control block and setting the pointer to NULL. Fixes: e4b8b3f33fca ("s390: add support for runtime instrumentation") Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/runtime_instr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c index fffa0e5462af..70cdb03d4acd 100644 --- a/arch/s390/kernel/runtime_instr.c +++ b/arch/s390/kernel/runtime_instr.c @@ -47,11 +47,13 @@ void exit_thread_runtime_instr(void) { struct task_struct *task = current; + preempt_disable(); if (!task->thread.ri_cb) return; disable_runtime_instr(); kfree(task->thread.ri_cb); task->thread.ri_cb = NULL; + preempt_enable(); } SYSCALL_DEFINE1(s390_runtime_instr, int, command) @@ -62,9 +64,7 @@ SYSCALL_DEFINE1(s390_runtime_instr, int, command) return -EOPNOTSUPP; if (command == S390_RUNTIME_INSTR_STOP) { - preempt_disable(); exit_thread_runtime_instr(); - preempt_enable(); return 0; } -- cgit v1.2.3 From 4337fa2425f6b00ba0977444a12dc5439858cb38 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Sep 2017 09:16:48 +0200 Subject: s390/disassembler: add missing end marker for e7 table commit 5c50538752af7968f53924b22dede8ed4ce4cb3b upstream. The e7 opcode table does not have an end marker. Hence when trying to find an unknown e7 instruction the code will access memory behind the table until it finds something that matches the opcode, or the kernel crashes, whatever comes first. This affects not only the in-kernel disassembler but also uprobes and kprobes which refuse to set a probe on unknown instructions, and therefore search the opcode tables to figure out if instructions are known or not. Fixes: 3585cb0280654 ("s390/disassembler: add vector instructions") Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/dis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 6e72961608f0..33357e959fed 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1549,6 +1549,7 @@ static struct s390_insn opcode_e7[] = { { "vfsq", 0xce, INSTR_VRR_VV000MM }, { "vfs", 0xe2, INSTR_VRR_VVV00MM }, { "vftci", 0x4a, INSTR_VRI_VVIMM }, + { "", 0, INSTR_INVALID } }; static struct s390_insn opcode_eb[] = { -- cgit v1.2.3 From 5c2607d3e7cd8d1e4ee96fdc8d18d49a57f9500f Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 15 Nov 2017 14:15:36 +0100 Subject: s390/disassembler: increase show_code buffer size commit b192571d1ae375e0bbe0aa3ccfa1a3c3704454b9 upstream. Current buffer size of 64 is too small. objdump shows that there are instructions which would require up to 75 bytes buffer (with current formating). 128 bytes "ought to be enough for anybody". Also replaces 8 spaces with a single tab to reduce the memory footprint. Fixes the following KASAN finding: BUG: KASAN: stack-out-of-bounds in number+0x3fe/0x538 Write of size 1 at addr 000000005a4a75a0 by task bash/1282 CPU: 1 PID: 1282 Comm: bash Not tainted 4.14.0+ #215 Hardware name: IBM 2964 N96 702 (z/VM 6.4.0) Call Trace: ([<000000000011eeb6>] show_stack+0x56/0x88) [<0000000000e1ce1a>] dump_stack+0x15a/0x1b0 [<00000000004e2994>] print_address_description+0xf4/0x288 [<00000000004e2cf2>] kasan_report+0x13a/0x230 [<0000000000e38ae6>] number+0x3fe/0x538 [<0000000000e3dfe4>] vsnprintf+0x194/0x948 [<0000000000e3ea42>] sprintf+0xa2/0xb8 [<00000000001198dc>] print_insn+0x374/0x500 [<0000000000119346>] show_code+0x4ee/0x538 [<000000000011f234>] show_registers+0x34c/0x388 [<000000000011f2ae>] show_regs+0x3e/0xa8 [<000000000011f502>] die+0x1ea/0x2e8 [<0000000000138f0e>] do_no_context+0x106/0x168 [<0000000000139a1a>] do_protection_exception+0x4da/0x7d0 [<0000000000e55914>] pgm_check_handler+0x16c/0x1c0 [<000000000090639e>] sysrq_handle_crash+0x46/0x58 ([<0000000000000007>] 0x7) [<00000000009073fa>] __handle_sysrq+0x102/0x218 [<0000000000907c06>] write_sysrq_trigger+0xd6/0x100 [<000000000061d67a>] proc_reg_write+0xb2/0x128 [<0000000000520be6>] __vfs_write+0xee/0x368 [<0000000000521222>] vfs_write+0x21a/0x278 [<000000000052156a>] SyS_write+0xda/0x178 [<0000000000e555cc>] system_call+0xc4/0x270 The buggy address belongs to the page: page:000003d1016929c0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x0() raw: 0000000000000000 0000000000000000 0000000000000000 ffffffff00000000 raw: 0000000000000100 0000000000000200 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: 000000005a4a7480: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 000000005a4a7500: 00 00 00 00 00 00 00 00 f2 f2 f2 f2 00 00 00 00 >000000005a4a7580: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 ^ 000000005a4a7600: 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 f8 f8 000000005a4a7680: f2 f2 f2 f2 f2 f2 f8 f8 f2 f2 f3 f3 f3 f3 00 00 ================================================================== Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/dis.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index 33357e959fed..07477ba392b7 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -1962,7 +1962,7 @@ void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; - char buffer[64], *ptr; + char buffer[128], *ptr; mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; @@ -2025,7 +2025,7 @@ void show_code(struct pt_regs *regs) start += opsize; printk(buffer); ptr = buffer; - ptr += sprintf(ptr, "\n "); + ptr += sprintf(ptr, "\n\t "); hops++; } printk("\n"); -- cgit v1.2.3 From 2417da3f4d6bc4fc6c77f613f0e2264090892aa5 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 20 Jun 2017 11:42:27 -0700 Subject: ipv6: only call ip6_route_dev_notify() once for NETDEV_UNREGISTER commit 76da0704507bbc51875013f6557877ab308cfd0a upstream. In commit 242d3a49a2a1 ("ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf") I assumed NETDEV_REGISTER and NETDEV_UNREGISTER are paired, unfortunately, as reported by jeffy, netdev_wait_allrefs() could rebroadcast NETDEV_UNREGISTER event until all refs are gone. We have to add an additional check to avoid this corner case. For netdev_wait_allrefs() dev->reg_state is NETREG_UNREGISTERED, for dev_change_net_namespace(), dev->reg_state is NETREG_REGISTERED. So check for dev->reg_state != NETREG_UNREGISTERED. Fixes: 242d3a49a2a1 ("ipv6: reorder ip6_route_dev_notifier after ipv6_dev_notf") Reported-by: jeffy Cc: David Ahern Signed-off-by: Cong Wang Acked-by: David Ahern Signed-off-by: David S. Miller Cc: Konstantin Khlebnikov Signed-off-by: Greg Kroah-Hartman --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 48917437550e..7336a7311038 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3378,7 +3378,11 @@ static int ip6_route_dev_notify(struct notifier_block *this, net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif - } else if (event == NETDEV_UNREGISTER) { + } else if (event == NETDEV_UNREGISTER && + dev->reg_state != NETREG_UNREGISTERED) { + /* NETDEV_UNREGISTER could be fired for multiple times by + * netdev_wait_allrefs(). Make sure we only call this once. + */ in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); -- cgit v1.2.3 From df24d6c224602ec23ad2626b07e13c40d018cbbc Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 22 Mar 2016 17:05:52 +0100 Subject: AF_VSOCK: Shrink the area influenced by prepare_to_wait commit f7f9b5e7f8eccfd68ffa7b8d74b07c478bb9e7f0 upstream. When a thread is prepared for waiting by calling prepare_to_wait, sleeping is not allowed until either the wait has taken place or finish_wait has been called. The existing code in af_vsock imposed unnecessary no-sleep assumptions to a broad list of backend functions. This patch shrinks the influence of prepare_to_wait to the area where it is strictly needed, therefore relaxing the no-sleep restriction there. Signed-off-by: Claudio Imbrenda Signed-off-by: David S. Miller Cc: "Jorgen S. Hansen" Signed-off-by: Greg Kroah-Hartman --- net/vmw_vsock/af_vsock.c | 158 +++++++++++++++++++++++++---------------------- 1 file changed, 85 insertions(+), 73 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 9b5bd6d142dc..b5f1221f48d4 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1209,10 +1209,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait_error; + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - goto out_wait_error; + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + goto out_wait; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -1220,20 +1224,17 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - goto out_wait_error; - } else + sk->sk_state = SS_UNCONNECTED; + sock->state = SS_UNCONNECTED; + } else { err = 0; + } out_wait: finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; - -out_wait_error: - sk->sk_state = SS_UNCONNECTED; - sock->state = SS_UNCONNECTED; - goto out_wait; } static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) @@ -1270,18 +1271,20 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) listener->sk_err == 0) { release_sock(listener); timeout = schedule_timeout(timeout); + finish_wait(sk_sleep(listener), &wait); lock_sock(listener); if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait; + goto out; } else if (timeout == 0) { err = -EAGAIN; - goto out_wait; + goto out; } prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE); } + finish_wait(sk_sleep(listener), &wait); if (listener->sk_err) err = -listener->sk_err; @@ -1301,19 +1304,15 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) */ if (err) { vconnected->rejected = true; - release_sock(connected); - sock_put(connected); - goto out_wait; + } else { + newsock->state = SS_CONNECTED; + sock_graft(connected, newsock); } - newsock->state = SS_CONNECTED; - sock_graft(connected, newsock); release_sock(connected); sock_put(connected); } -out_wait: - finish_wait(sk_sleep(listener), &wait); out: release_sock(listener); return err; @@ -1557,11 +1556,11 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (total_written < len) { ssize_t written; + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1570,27 +1569,33 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); - if (err < 0) - goto out_wait; + if (err < 0) { + finish_wait(sk_sleep(sk), &wait); + goto out_err; + } release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } else if (timeout == 0) { err = -EAGAIN; - goto out_wait; + finish_wait(sk_sleep(sk), &wait); + goto out_err; } prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } + finish_wait(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after @@ -1598,16 +1603,16 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, */ if (sk->sk_err) { err = -sk->sk_err; - goto out_wait; + goto out_err; } else if ((sk->sk_shutdown & SEND_SHUTDOWN) || (vsk->peer_shutdown & RCV_SHUTDOWN)) { err = -EPIPE; - goto out_wait; + goto out_err; } err = transport->notify_send_pre_enqueue(vsk, &send_data); if (err < 0) - goto out_wait; + goto out_err; /* Note that enqueue will only write as many bytes as are free * in the produce queue, so we don't need to ensure len is @@ -1620,7 +1625,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, len - total_written); if (written < 0) { err = -ENOMEM; - goto out_wait; + goto out_err; } total_written += written; @@ -1628,14 +1633,13 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = transport->notify_send_post_enqueue( vsk, written, &send_data); if (err < 0) - goto out_wait; + goto out_err; } -out_wait: +out_err: if (total_written > 0) err = total_written; - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; @@ -1716,21 +1720,61 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (err < 0) goto out; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); while (1) { - s64 ready = vsock_stream_has_data(vsk); + s64 ready; - if (ready < 0) { - /* Invalid queue pair content. XXX This should be - * changed to a connection reset in a later change. - */ + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + ready = vsock_stream_has_data(vsk); - err = -ENOMEM; - goto out_wait; - } else if (ready > 0) { + if (ready == 0) { + if (sk->sk_err != 0 || + (sk->sk_shutdown & RCV_SHUTDOWN) || + (vsk->peer_shutdown & SEND_SHUTDOWN)) { + finish_wait(sk_sleep(sk), &wait); + break; + } + /* Don't wait for non-blocking sockets. */ + if (timeout == 0) { + err = -EAGAIN; + finish_wait(sk_sleep(sk), &wait); + break; + } + + err = transport->notify_recv_pre_block( + vsk, target, &recv_data); + if (err < 0) { + finish_wait(sk_sleep(sk), &wait); + break; + } + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + + if (signal_pending(current)) { + err = sock_intr_errno(timeout); + finish_wait(sk_sleep(sk), &wait); + break; + } else if (timeout == 0) { + err = -EAGAIN; + finish_wait(sk_sleep(sk), &wait); + break; + } + } else { ssize_t read; + finish_wait(sk_sleep(sk), &wait); + + if (ready < 0) { + /* Invalid queue pair content. XXX This should + * be changed to a connection reset in a later + * change. + */ + + err = -ENOMEM; + goto out; + } + err = transport->notify_recv_pre_dequeue( vsk, target, &recv_data); if (err < 0) @@ -1750,42 +1794,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, vsk, target, read, !(flags & MSG_PEEK), &recv_data); if (err < 0) - goto out_wait; + goto out; if (read >= target || flags & MSG_PEEK) break; target -= read; - } else { - if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN) - || (vsk->peer_shutdown & SEND_SHUTDOWN)) { - break; - } - /* Don't wait for non-blocking sockets. */ - if (timeout == 0) { - err = -EAGAIN; - break; - } - - err = transport->notify_recv_pre_block( - vsk, target, &recv_data); - if (err < 0) - break; - - release_sock(sk); - timeout = schedule_timeout(timeout); - lock_sock(sk); - - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - break; - } else if (timeout == 0) { - err = -EAGAIN; - break; - } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } } @@ -1797,8 +1811,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (copied > 0) err = copied; -out_wait: - finish_wait(sk_sleep(sk), &wait); out: release_sock(sk); return err; -- cgit v1.2.3 From 3223ea129170a7799b92ee24862275aa7f6b9d1b Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 19 May 2017 11:21:59 -0700 Subject: vsock: use new wait API for vsock_stream_sendmsg() commit 499fde662f1957e3cb8d192a94a099ebe19c714b upstream. As reported by Michal, vsock_stream_sendmsg() could still sleep at vsock_stream_has_space() after prepare_to_wait(): vsock_stream_has_space vmci_transport_stream_has_space vmci_qpair_produce_free_space qp_lock qp_acquire_queue_mutex mutex_lock Just switch to the new wait API like we did for commit d9dc8b0f8b4e ("net: fix sleeping for sk_wait_event()"). Reported-by: Michal Kubecek Cc: Stefan Hajnoczi Cc: Jorgen Hansen Cc: "Michael S. Tsirkin" Cc: Claudio Imbrenda Signed-off-by: Cong Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: David S. Miller Cc: "Jorgen S. Hansen" Signed-off-by: Greg Kroah-Hartman --- net/vmw_vsock/af_vsock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b5f1221f48d4..60324f7c72bd 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1512,8 +1512,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, long timeout; int err; struct vsock_transport_send_notify_data send_data; - - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); sk = sock->sk; vsk = vsock_sk(sk); @@ -1556,11 +1555,10 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) goto out; - while (total_written < len) { ssize_t written; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); while (vsock_stream_has_space(vsk) == 0 && sk->sk_err == 0 && !(sk->sk_shutdown & SEND_SHUTDOWN) && @@ -1569,33 +1567,30 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Don't wait for non-blocking sockets. */ if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } err = transport->notify_send_pre_block(vsk, &send_data); if (err < 0) { - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } release_sock(sk); - timeout = schedule_timeout(timeout); + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); lock_sock(sk); if (signal_pending(current)) { err = sock_intr_errno(timeout); - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } else if (timeout == 0) { err = -EAGAIN; - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); goto out_err; } - - prepare_to_wait(sk_sleep(sk), &wait, - TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); /* These checks occur both as part of and after the loop * conditional since we need to check before and after -- cgit v1.2.3 From 8ff3471878f3f8161bff92f73b3ecb35d6c397dc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Sep 2017 08:54:40 -0700 Subject: sched: Make resched_cpu() unconditional commit 7c2102e56a3f7d85b5d8f33efbd7aecc1f36fdd8 upstream. The current implementation of synchronize_sched_expedited() incorrectly assumes that resched_cpu() is unconditional, which it is not. This means that synchronize_sched_expedited() can hang when resched_cpu()'s trylock fails as follows (analysis by Neeraj Upadhyay): o CPU1 is waiting for expedited wait to complete: sync_rcu_exp_select_cpus rdp->exp_dynticks_snap & 0x1 // returns 1 for CPU5 IPI sent to CPU5 synchronize_sched_expedited_wait ret = swait_event_timeout(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); expmask = 0x20, CPU 5 in idle path (in cpuidle_enter()) o CPU5 handles IPI and fails to acquire rq lock. Handles IPI sync_sched_exp_handler resched_cpu returns while failing to try lock acquire rq->lock need_resched is not set o CPU5 calls rcu_idle_enter() and as need_resched is not set, goes to idle (schedule() is not called). o CPU 1 reports RCU stall. Given that resched_cpu() is now used only by RCU, this commit fixes the assumption by making resched_cpu() unconditional. Reported-by: Neeraj Upadhyay Suggested-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt (VMware) Acked-by: Peter Zijlstra (Intel) Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b5d372083624..a99c32946e49 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -600,8 +600,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } -- cgit v1.2.3 From 4fdb1637b2083824403de119b07c6bfe6560915d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 7 Nov 2017 14:15:27 -0800 Subject: lib/mpi: call cond_resched() from mpi_powm() loop commit 1d9ddde12e3c9bab7f3d3484eb9446315e3571ca upstream. On a non-preemptible kernel, if KEYCTL_DH_COMPUTE is called with the largest permitted inputs (16384 bits), the kernel spends 10+ seconds doing modular exponentiation in mpi_powm() without rescheduling. If all threads do it, it locks up the system. Moreover, it can cause rcu_sched-stall warnings. Notwithstanding the insanity of doing this calculation in kernel mode rather than in userspace, fix it by calling cond_resched() as each bit from the exponent is processed. It's still noninterruptible, but at least it's preemptible now. Do the cond_resched() once per bit rather than once per MPI limb because each limb might still easily take 100+ milliseconds on slow CPUs. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- lib/mpi/mpi-pow.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c index e24388a863a7..468fb7cd1221 100644 --- a/lib/mpi/mpi-pow.c +++ b/lib/mpi/mpi-pow.c @@ -26,6 +26,7 @@ * however I decided to publish this code under the plain GPL. */ +#include #include #include "mpi-internal.h" #include "longlong.h" @@ -256,6 +257,7 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) } e <<= 1; c--; + cond_resched(); } i--; -- cgit v1.2.3 From 29c4b6b4f46d4cc53b4f8c2b45e218ccfa1eea68 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 24 Nov 2017 13:56:30 +0900 Subject: x86/decoder: Add new TEST instruction pattern commit 12a78d43de767eaf8fb272facb7a7b6f2dc6a9df upstream. The kbuild test robot reported this build warning: Warning: arch/x86/tools/test_get_len found difference at :ffffffff8103dd2c Warning: ffffffff8103dd82: f6 09 d8 testb $0xd8,(%rcx) Warning: objdump says 3 bytes, but insn_get_length() says 2 Warning: decoded and checked 1569014 instructions with 1 warnings This sequence seems to be a new instruction not in the opcode map in the Intel SDM. The instruction sequence is "F6 09 d8", means Group3(F6), MOD(00)REG(001)RM(001), and 0xd8. Intel SDM vol2 A.4 Table A-6 said the table index in the group is "Encoding of Bits 5,4,3 of the ModR/M Byte (bits 2,1,0 in parenthesis)" In that table, opcodes listed by the index REG bits as: 000 001 010 011 100 101 110 111 TEST Ib/Iz,(undefined),NOT,NEG,MUL AL/rAX,IMUL AL/rAX,DIV AL/rAX,IDIV AL/rAX So, it seems TEST Ib is assigned to 001. Add the new pattern. Reported-by: kbuild test robot Signed-off-by: Masami Hiramatsu Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/x86-opcode-map.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index d388de72eaca..ec039f2a0c13 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -833,7 +833,7 @@ EndTable GrpTable: Grp3_1 0: TEST Eb,Ib -1: +1: TEST Eb,Ib 2: NOT Eb 3: NEG Eb 4: MUL AL,Eb -- cgit v1.2.3 From 5f8046f7c9e3049000bbe610c15390bf698d681e Mon Sep 17 00:00:00 2001 From: Philip Derrin Date: Tue, 14 Nov 2017 00:55:25 +0100 Subject: ARM: 8722/1: mm: make STRICT_KERNEL_RWX effective for LPAE commit 400eeffaffc7232c0ae1134fe04e14ae4fb48d8c upstream. Currently, for ARM kernels with CONFIG_ARM_LPAE and CONFIG_STRICT_KERNEL_RWX enabled, the 2MiB pages mapping the kernel code and rodata are writable. They are marked read-only in a software bit (L_PMD_SECT_RDONLY) but the hardware read-only bit is not set (PMD_SECT_AP2). For user mappings, the logic that propagates the software bit to the hardware bit is in set_pmd_at(); but for the kernel, section_update() writes the PMDs directly, skipping this logic. The fix is to set PMD_SECT_AP2 for read-only sections in section_update(), at the same time as L_PMD_SECT_RDONLY. Fixes: 1e3479225acb ("ARM: 8275/1: mm: fix PMD_SECT_RDONLY undeclared compile error") Signed-off-by: Philip Derrin Reported-by: Neil Dick Tested-by: Neil Dick Tested-by: Laura Abbott Reviewed-by: Kees Cook Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7f8cd1b3557f..c29ad610311b 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -611,8 +611,8 @@ static struct section_perm ro_perms[] = { .start = (unsigned long)_stext, .end = (unsigned long)__init_begin, #ifdef CONFIG_ARM_LPAE - .mask = ~L_PMD_SECT_RDONLY, - .prot = L_PMD_SECT_RDONLY, + .mask = ~(L_PMD_SECT_RDONLY | PMD_SECT_AP2), + .prot = L_PMD_SECT_RDONLY | PMD_SECT_AP2, #else .mask = ~(PMD_SECT_APX | PMD_SECT_AP_WRITE), .prot = PMD_SECT_APX | PMD_SECT_AP_WRITE, -- cgit v1.2.3 From 36a082ce590f4e2625453d24dfcf98f74d2fe8c7 Mon Sep 17 00:00:00 2001 From: Philip Derrin Date: Tue, 14 Nov 2017 00:55:26 +0100 Subject: ARM: 8721/1: mm: dump: check hardware RO bit for LPAE commit 3b0c0c922ff4be275a8beb87ce5657d16f355b54 upstream. When CONFIG_ARM_LPAE is set, the PMD dump relies on the software read-only bit to determine whether a page is writable. This concealed a bug which left the kernel text section writable (AP2=0) while marked read-only in the software bit. In a kernel with the AP2 bug, the dump looks like this: ---[ Kernel Mapping ]--- 0xc0000000-0xc0200000 2M RW NX SHD 0xc0200000-0xc0600000 4M ro x SHD 0xc0600000-0xc0800000 2M ro NX SHD 0xc0800000-0xc4800000 64M RW NX SHD The fix is to check that the software and hardware bits are both set before displaying "ro". The dump then shows the true perms: ---[ Kernel Mapping ]--- 0xc0000000-0xc0200000 2M RW NX SHD 0xc0200000-0xc0600000 4M RW x SHD 0xc0600000-0xc0800000 2M RW NX SHD 0xc0800000-0xc4800000 64M RW NX SHD Fixes: ded947798469 ("ARM: 8109/1: mm: Modify pte_write and pmd_write logic for LPAE") Signed-off-by: Philip Derrin Tested-by: Neil Dick Reviewed-by: Kees Cook Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/dump.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 9fe8e241335c..e1f6f0daa847 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -126,8 +126,8 @@ static const struct prot_bits section_bits[] = { .val = PMD_SECT_USER, .set = "USR", }, { - .mask = L_PMD_SECT_RDONLY, - .val = L_PMD_SECT_RDONLY, + .mask = L_PMD_SECT_RDONLY | PMD_SECT_AP2, + .val = L_PMD_SECT_RDONLY | PMD_SECT_AP2, .set = "ro", .clear = "RW", #elif __LINUX_ARM_ARCH__ >= 6 -- cgit v1.2.3 From 84c785ed786a61dfd35ea3e818ead7dc0adddfd2 Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Thu, 11 May 2017 08:11:14 +0200 Subject: MIPS: ralink: Fix MT7628 pinmux commit 8ef4b43cd3794d63052d85898e42424fd3b14d24 upstream. According to the datasheet the REFCLK pin is shared with GPIO#37 and the PERST pin is shared with GPIO#36. Fixes: 53263a1c6852 ("MIPS: ralink: add mt7628an support") Signed-off-by: Mathias Kresin Acked-by: John Crispin Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/16046/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/mt7620.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c index 48d6349fd9d7..9fd401c304b1 100644 --- a/arch/mips/ralink/mt7620.c +++ b/arch/mips/ralink/mt7620.c @@ -141,8 +141,8 @@ static struct rt2880_pmx_func i2c_grp_mt7628[] = { FUNC("i2c", 0, 4, 2), }; -static struct rt2880_pmx_func refclk_grp_mt7628[] = { FUNC("reclk", 0, 36, 1) }; -static struct rt2880_pmx_func perst_grp_mt7628[] = { FUNC("perst", 0, 37, 1) }; +static struct rt2880_pmx_func refclk_grp_mt7628[] = { FUNC("reclk", 0, 37, 1) }; +static struct rt2880_pmx_func perst_grp_mt7628[] = { FUNC("perst", 0, 36, 1) }; static struct rt2880_pmx_func wdt_grp_mt7628[] = { FUNC("wdt", 0, 38, 1) }; static struct rt2880_pmx_func spi_grp_mt7628[] = { FUNC("spi", 0, 7, 4) }; -- cgit v1.2.3 From 0c1faf9df0c840df75a4e1ee6a1c967f6ff10428 Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Thu, 11 May 2017 08:11:15 +0200 Subject: MIPS: ralink: Fix typo in mt7628 pinmux function commit 05a67cc258e75ac9758e6f13d26337b8be51162a upstream. There is a typo inside the pinmux setup code. The function is called refclk and not reclk. Fixes: 53263a1c6852 ("MIPS: ralink: add mt7628an support") Signed-off-by: Mathias Kresin Acked-by: John Crispin Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/16047/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/mt7620.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ralink/mt7620.c b/arch/mips/ralink/mt7620.c index 9fd401c304b1..c5f45fc96c74 100644 --- a/arch/mips/ralink/mt7620.c +++ b/arch/mips/ralink/mt7620.c @@ -141,7 +141,7 @@ static struct rt2880_pmx_func i2c_grp_mt7628[] = { FUNC("i2c", 0, 4, 2), }; -static struct rt2880_pmx_func refclk_grp_mt7628[] = { FUNC("reclk", 0, 37, 1) }; +static struct rt2880_pmx_func refclk_grp_mt7628[] = { FUNC("refclk", 0, 37, 1) }; static struct rt2880_pmx_func perst_grp_mt7628[] = { FUNC("perst", 0, 36, 1) }; static struct rt2880_pmx_func wdt_grp_mt7628[] = { FUNC("wdt", 0, 38, 1) }; static struct rt2880_pmx_func spi_grp_mt7628[] = { FUNC("spi", 0, 7, 4) }; -- cgit v1.2.3 From a9f066404fd096351320a16660dd47a8a868a220 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 23 Nov 2017 20:07:00 +0530 Subject: ALSA: hda: Add Raven PCI ID commit 9ceace3c9c18c67676e75141032a65a8e01f9a7a upstream. This commit adds PCI ID for Raven platform Signed-off-by: Vijendar Mukunda Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e6de496bffbe..e2e08fc73b50 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2316,6 +2316,9 @@ static const struct pci_device_id azx_ids[] = { /* AMD Hudson */ { PCI_DEVICE(0x1022, 0x780d), .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB }, + /* AMD Raven */ + { PCI_DEVICE(0x1022, 0x15e3), + .driver_data = AZX_DRIVER_GENERIC | AZX_DCAPS_PRESET_ATI_SB }, /* ATI HDMI */ { PCI_DEVICE(0x1002, 0x0002), .driver_data = AZX_DRIVER_ATIHDMI_NS | AZX_DCAPS_PRESET_ATI_HDMI_NS }, -- cgit v1.2.3 From 36c4819abc92e6cb22de81568407a92cdb8e9363 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 15 Nov 2017 16:38:09 -0800 Subject: dm bufio: fix integer overflow when limiting maximum cache size commit 74d4108d9e681dbbe4a2940ed8fdff1f6868184c upstream. The default max_cache_size_bytes for dm-bufio is meant to be the lesser of 25% of the size of the vmalloc area and 2% of the size of lowmem. However, on 32-bit systems the intermediate result in the expression (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100 overflows, causing the wrong result to be computed. For example, on a 32-bit system where the vmalloc area is 520093696 bytes, the result is 1174405 rather than the expected 130023424, which makes the maximum cache size much too small (far less than 2% of lowmem). This causes severe performance problems for dm-verity users on affected systems. Fix this by using mult_frac() to correctly multiply by a percentage. Do this for all places in dm-bufio that multiply by a percentage. Also replace (VMALLOC_END - VMALLOC_START) with VMALLOC_TOTAL, which contrary to the comment is now defined in include/linux/vmalloc.h. Depends-on: 9993bc635 ("sched/x86: Fix overflow in cyc2ns_offset") Fixes: 95d402f057f2 ("dm: add bufio") Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-bufio.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index cdceefd0e57d..2ec7f90e3455 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -928,7 +928,8 @@ static void __get_memory_limit(struct dm_bufio_client *c, buffers = c->minimum_buffers; *limit_buffers = buffers; - *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; + *threshold_buffers = mult_frac(buffers, + DM_BUFIO_WRITEBACK_PERCENT, 100); } /* @@ -1829,19 +1830,15 @@ static int __init dm_bufio_init(void) memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); - mem = (__u64)((totalram_pages - totalhigh_pages) * - DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; + mem = (__u64)mult_frac(totalram_pages - totalhigh_pages, + DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT; if (mem > ULONG_MAX) mem = ULONG_MAX; #ifdef CONFIG_MMU - /* - * Get the size of vmalloc space the same way as VMALLOC_TOTAL - * in fs/proc/internal.h - */ - if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) - mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; + if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100)) + mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100); #endif dm_bufio_default_cache_size = mem; -- cgit v1.2.3 From 4e82464aa4a398207e2ecbc4877c82319ecdbafa Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Nov 2017 15:42:36 +0800 Subject: dm: fix race between dm_get_from_kobject() and __dm_destroy() commit b9a41d21dceadf8104812626ef85dc56ee8a60ed upstream. The following BUG_ON was hit when testing repeat creation and removal of DM devices: kernel BUG at drivers/md/dm.c:2919! CPU: 7 PID: 750 Comm: systemd-udevd Not tainted 4.1.44 Call Trace: [] dm_get_from_kobject+0x34/0x3a [] dm_attr_show+0x2b/0x5e [] ? mutex_lock+0x26/0x44 [] sysfs_kf_seq_show+0x83/0xcf [] kernfs_seq_show+0x23/0x25 [] seq_read+0x16f/0x325 [] kernfs_fop_read+0x3a/0x13f [] __vfs_read+0x26/0x9d [] ? security_file_permission+0x3c/0x44 [] ? rw_verify_area+0x83/0xd9 [] vfs_read+0x8f/0xcf [] ? __fdget_pos+0x12/0x41 [] SyS_read+0x4b/0x76 [] system_call_fastpath+0x12/0x71 The bug can be easily triggered, if an extra delay (e.g. 10ms) is added between the test of DMF_FREEING & DMF_DELETING and dm_get() in dm_get_from_kobject(). To fix it, we need to ensure the test of DMF_FREEING & DMF_DELETING and dm_get() are done in an atomic way, so _minor_lock is used. The other callers of dm_get() have also been checked to be OK: some callers invoke dm_get() under _minor_lock, some callers invoke it under _hash_lock, and dm_start_request() invoke it after increasing md->open_count. Signed-off-by: Hou Tao Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 320eb3c4bb6b..9ec6948e3b8b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -3507,11 +3507,15 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj) md = container_of(kobj, struct mapped_device, kobj_holder.kobj); - if (test_bit(DMF_FREEING, &md->flags) || - dm_deleting_md(md)) - return NULL; - + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) { + md = NULL; + goto out; + } dm_get(md); +out: + spin_unlock(&_minor_lock); + return md; } -- cgit v1.2.3 From 00fd53bc378fdeec3caece086de4eadd798b2346 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 7 Nov 2017 19:09:20 +0000 Subject: MIPS: Fix an n32 core file generation regset support regression commit 547da673173de51f73887377eb275304775064ad upstream. Fix a commit 7aeb753b5353 ("MIPS: Implement task_user_regset_view.") regression, then activated by commit 6a9c001b7ec3 ("MIPS: Switch ELF core dumper to use regsets.)", that caused n32 processes to dump o32 core files by failing to set the EF_MIPS_ABI2 flag in the ELF core file header's `e_flags' member: $ file tls-core tls-core: ELF 32-bit MSB executable, MIPS, N32 MIPS64 rel2 version 1 (SYSV), [...] $ ./tls-core Aborted (core dumped) $ file core core: ELF 32-bit MSB core file MIPS, MIPS-I version 1 (SYSV), SVR4-style $ Previously the flag was set as the result of a: statement placed in arch/mips/kernel/binfmt_elfn32.c, however in the regset case, i.e. when CORE_DUMP_USE_REGSET is set, ELF_CORE_EFLAGS is no longer used by `fill_note_info' in fs/binfmt_elf.c, and instead the `->e_flags' member of the regset view chosen is. We have the views defined in arch/mips/kernel/ptrace.c, however only an o32 and an n64 one, and the latter is used for n32 as well. Consequently an o32 core file is incorrectly dumped from n32 processes (the ELF32 vs ELF64 class is chosen elsewhere, and the 32-bit one is correctly selected for n32). Correct the issue then by defining an n32 regset view and using it as appropriate. Issue discovered in GDB testing. Fixes: 7aeb753b5353 ("MIPS: Implement task_user_regset_view.") Signed-off-by: Maciej W. Rozycki Cc: Ralf Baechle Cc: Djordje Todorovic Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17617/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 24c115a0721a..a3f38e6b7ea1 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -650,6 +650,19 @@ static const struct user_regset_view user_mips64_view = { .n = ARRAY_SIZE(mips64_regsets), }; +#ifdef CONFIG_MIPS32_N32 + +static const struct user_regset_view user_mipsn32_view = { + .name = "mipsn32", + .e_flags = EF_MIPS_ABI2, + .e_machine = ELF_ARCH, + .ei_osabi = ELF_OSABI, + .regsets = mips64_regsets, + .n = ARRAY_SIZE(mips64_regsets), +}; + +#endif /* CONFIG_MIPS32_N32 */ + #endif /* CONFIG_64BIT */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) @@ -660,6 +673,10 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #ifdef CONFIG_MIPS32_O32 if (test_tsk_thread_flag(task, TIF_32BIT_REGS)) return &user_mips_view; +#endif +#ifdef CONFIG_MIPS32_N32 + if (test_tsk_thread_flag(task, TIF_32BIT_ADDR)) + return &user_mipsn32_view; #endif return &user_mips64_view; #endif -- cgit v1.2.3 From 153142963ca1fa2b2f3bf3f687e00ef1a63bff5f Mon Sep 17 00:00:00 2001 From: Mirko Parthey Date: Thu, 18 May 2017 21:30:03 +0200 Subject: MIPS: BCM47XX: Fix LED inversion for WRT54GSv1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 56a46acf62af5ba44fca2f3f1c7c25a2d5385b19 upstream. The WLAN LED on the Linksys WRT54GSv1 is active low, but the software treats it as active high. Fix the inverted logic. Fixes: 7bb26b169116 ("MIPS: BCM47xx: Fix LEDs on WRT54GS V1.0") Signed-off-by: Mirko Parthey Looks-ok-by: Rafał Miłecki Cc: Hauke Mehrtens Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/16071/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/bcm47xx/leds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/bcm47xx/leds.c b/arch/mips/bcm47xx/leds.c index d20ae63eb3c2..46abe9e4e0e0 100644 --- a/arch/mips/bcm47xx/leds.c +++ b/arch/mips/bcm47xx/leds.c @@ -330,7 +330,7 @@ bcm47xx_leds_linksys_wrt54g3gv2[] __initconst = { /* Verified on: WRT54GS V1.0 */ static const struct gpio_led bcm47xx_leds_linksys_wrt54g_type_0101[] __initconst = { - BCM47XX_GPIO_LED(0, "green", "wlan", 0, LEDS_GPIO_DEFSTATE_OFF), + BCM47XX_GPIO_LED(0, "green", "wlan", 1, LEDS_GPIO_DEFSTATE_OFF), BCM47XX_GPIO_LED(1, "green", "power", 0, LEDS_GPIO_DEFSTATE_ON), BCM47XX_GPIO_LED(7, "green", "dmz", 1, LEDS_GPIO_DEFSTATE_OFF), }; -- cgit v1.2.3 From 9a4e08c634cebe28bfb96ec9482ed309f7c9b679 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 17 Nov 2017 15:29:13 -0800 Subject: autofs: don't fail mount for transient error commit ecc0c469f27765ed1e2b967be0aa17cee1a60b76 upstream. Currently if the autofs kernel module gets an error when writing to the pipe which links to the daemon, then it marks the whole moutpoint as catatonic, and it will stop working. It is possible that the error is transient. This can happen if the daemon is slow and more than 16 requests queue up. If a subsequent process tries to queue a request, and is then signalled, the write to the pipe will return -ERESTARTSYS and autofs will take that as total failure. So change the code to assess -ERESTARTSYS and -ENOMEM as transient failures which only abort the current request, not the whole mountpoint. It isn't a crash or a data corruption, but having autofs mountpoints suddenly stop working is rather inconvenient. Ian said: : And given the problems with a half dozen (or so) user space applications : consuming large amounts of CPU under heavy mount and umount activity this : could happen more easily than we expect. Link: http://lkml.kernel.org/r/87y3norvgp.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Acked-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/autofs4/waitq.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35b755e79c2d..fe6e7050fe50 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -87,7 +87,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -101,6 +102,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); @@ -173,7 +175,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } -- cgit v1.2.3 From 7d7b05e4ffd5ad7cf54b57dfc2f377d9a543c8c6 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Fri, 17 Nov 2017 15:29:35 -0800 Subject: nilfs2: fix race condition that causes file system corruption commit 31ccb1f7ba3cfe29631587d451cf5bb8ab593550 upstream. There is a race condition between nilfs_dirty_inode() and nilfs_set_file_dirty(). When a file is opened, nilfs_dirty_inode() is called to update the access timestamp in the inode. It calls __nilfs_mark_inode_dirty() in a separate transaction. __nilfs_mark_inode_dirty() caches the ifile buffer_head in the i_bh field of the inode info structure and marks it as dirty. After some data was written to the file in another transaction, the function nilfs_set_file_dirty() is called, which adds the inode to the ns_dirty_files list. Then the segment construction calls nilfs_segctor_collect_dirty_files(), which goes through the ns_dirty_files list and checks the i_bh field. If there is a cached buffer_head in i_bh it is not marked as dirty again. Since nilfs_dirty_inode() and nilfs_set_file_dirty() use separate transactions, it is possible that a segment construction that writes out the ifile occurs in-between the two. If this happens the inode is not on the ns_dirty_files list, but its ifile block is still marked as dirty and written out. In the next segment construction, the data for the file is written out and nilfs_bmap_propagate() updates the b-tree. Eventually the bmap root is written into the i_bh block, which is not dirty, because it was written out in another segment construction. As a result the bmap update can be lost, which leads to file system corruption. Either the virtual block address points to an unallocated DAT block, or the DAT entry will be reused for something different. The error can remain undetected for a long time. A typical error message would be one of the "bad btree" errors or a warning that a DAT entry could not be found. This bug can be reproduced reliably by a simple benchmark that creates and overwrites millions of 4k files. Link: http://lkml.kernel.org/r/1509367935-3086-2-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Tested-by: Andreas Rohner Tested-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 2f27c935bd57..34c22fe4eca0 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1945,8 +1945,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, "failed to get inode block.\n"); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1955,6 +1953,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); -- cgit v1.2.3 From 9c093a258350e86f781f23fbbef4e3d761c6b1e3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 22 Aug 2017 23:41:28 +0300 Subject: eCryptfs: use after free in ecryptfs_release_messaging() commit db86be3a12d0b6e5c5b51c2ab2a48f06329cb590 upstream. We're freeing the list iterator so we should be using the _safe() version of hlist_for_each_entry(). Fixes: 88b4a07e6610 ("[PATCH] eCryptfs: Public key transport mechanism") Signed-off-by: Dan Carpenter Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/messaging.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 286f10b0363b..4f457d5c4933 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -442,15 +442,16 @@ void ecryptfs_release_messaging(void) } if (ecryptfs_daemon_hash) { struct ecryptfs_daemon *daemon; + struct hlist_node *n; int i; mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, - &ecryptfs_daemon_hash[i], - euid_chain) { + hlist_for_each_entry_safe(daemon, n, + &ecryptfs_daemon_hash[i], + euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); if (rc) printk(KERN_ERR "%s: Error whilst " -- cgit v1.2.3 From 85c79043808d400ace74c99526fb9f4f67253102 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:29 -0700 Subject: bcache: check ca->alloc_thread initialized before wake up it commit 91af8300d9c1d7c6b6a2fd754109e08d4798b8d8 upstream. In bcache code, sysfs entries are created before all resources get allocated, e.g. allocation thread of a cache set. There is posibility for NULL pointer deference if a resource is accessed but which is not initialized yet. Indeed Jorg Bornschein catches one on cache set allocation thread and gets a kernel oops. The reason for this bug is, when bch_bucket_alloc() is called during cache set registration and attaching, ca->alloc_thread is not properly allocated and initialized yet, call wake_up_process() on ca->alloc_thread triggers NULL pointer deference failure. A simple and fast fix is, before waking up ca->alloc_thread, checking whether it is allocated, and only wake up ca->alloc_thread when it is not NULL. Signed-off-by: Coly Li Reported-by: Jorg Bornschein Cc: Kent Overstreet Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8eeab72b93e2..ea47980949ef 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -406,7 +406,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); -- cgit v1.2.3 From 4e23be6169767d21f95d5bfd5df8a2c6274ee2d7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 19 Oct 2017 16:47:48 +0200 Subject: isofs: fix timestamps beyond 2027 commit 34be4dbf87fc3e474a842305394534216d428f5d upstream. isofs uses a 'char' variable to load the number of years since 1900 for an inode timestamp. On architectures that use a signed char type by default, this results in an invalid date for anything beyond 2027. This changes the function argument to a 'u8' array, which is defined the same way on all architectures, and unambiguously lets us use years until 2155. This should be backported to all kernels that might still be in use by that date. Signed-off-by: Arnd Bergmann Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/isofs/isofs.h | 2 +- fs/isofs/rock.h | 2 +- fs/isofs/util.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 0ac4c1f73fbd..25177e6bd603 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -103,7 +103,7 @@ static inline unsigned int isonum_733(char *p) /* Ignore bigendian datum due to broken mastering programs */ return get_unaligned_le32(p); } -extern int iso_date(char *, int); +extern int iso_date(u8 *, int); struct inode; /* To make gcc happy */ diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index ed09e2b08637..f835976ce033 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -65,7 +65,7 @@ struct RR_PL_s { }; struct stamp { - char time[7]; + __u8 time[7]; /* actually 6 unsigned, 1 signed */ } __attribute__ ((packed)); struct RR_TF_s { diff --git a/fs/isofs/util.c b/fs/isofs/util.c index 005a15cfd30a..37860fea364d 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -15,7 +15,7 @@ * to GMT. Thus we should always be correct. */ -int iso_date(char * p, int flag) +int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; -- cgit v1.2.3 From ab33df42eb3c53711c487c839d36379ffca3aeb9 Mon Sep 17 00:00:00 2001 From: Joshua Watt Date: Tue, 7 Nov 2017 16:25:47 -0600 Subject: NFS: Fix typo in nomigration mount option commit f02fee227e5f21981152850744a6084ff3fa94ee upstream. The option was incorrectly masking off all other options. Signed-off-by: Joshua Watt Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1268280244e..3149f7e58d6f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1322,7 +1322,7 @@ static int nfs_parse_mount_options(char *raw, mnt->options |= NFS_OPTION_MIGRATION; break; case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; + mnt->options &= ~NFS_OPTION_MIGRATION; break; /* -- cgit v1.2.3 From 2a2d4b41472c73439adc6b8b55fed212d85f4faa Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 5 Nov 2017 15:45:22 -0500 Subject: nfs: Fix ugly referral attributes commit c05cefcc72416a37eba5a2b35f0704ed758a9145 upstream. Before traversing a referral and performing a mount, the mounted-on directory looks strange: dr-xr-xr-x. 2 4294967294 4294967294 0 Dec 31 1969 dir.0 nfs4_get_referral is wiping out any cached attributes with what was returned via GETATTR(fs_locations), but the bit mask for that operation does not request any file attributes. Retrieve owner and timestamp information so that the memcpy in nfs4_get_referral fills in more attributes. Changes since v1: - Don't request attributes that the client unconditionally replaces - Request only MOUNTED_ON_FILEID or FILEID attribute, not both - encode_fs_locations() doesn't use the third bitmask word Fixes: 6b97fd3da1ea ("NFSv4: Follow a referral") Suggested-by: Pradeep Thomas Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8e425f2c5ddd..6fef53f18dcf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -242,15 +242,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -6351,9 +6348,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6372,12 +6367,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; -- cgit v1.2.3 From a8b8ab79ca4fd69182f5864ddcd59994717e0186 Mon Sep 17 00:00:00 2001 From: Andrew Elble Date: Fri, 3 Nov 2017 14:06:31 -0400 Subject: nfsd: deal with revoked delegations appropriately commit 95da1b3a5aded124dd1bda1e3cdb876184813140 upstream. If a delegation has been revoked by the server, operations using that delegation should error out with NFS4ERR_DELEG_REVOKED in the >4.1 case, and NFS4ERR_BAD_STATEID otherwise. The server needs NFSv4.1 clients to explicitly free revoked delegations. If the server returns NFS4ERR_DELEG_REVOKED, the client will do that; otherwise it may just forget about the delegation and be unable to recover when it later sees SEQ4_STATUS_RECALLABLE_STATE_REVOKED set on a SEQUENCE reply. That can cause the Linux 4.1 client to loop in its stage manager. Signed-off-by: Andrew Elble Reviewed-by: Trond Myklebust Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ca9ebc3242d3..421935f3d909 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3829,7 +3829,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); @@ -3852,6 +3853,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { @@ -4696,6 +4703,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + bool return_revoked = false; + + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. + */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return nfserr_bad_stateid; @@ -4710,6 +4727,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } return nfs_ok; } -- cgit v1.2.3 From 3c260c60d20c51283904462015069063f78b065d Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Thu, 14 Sep 2017 13:17:44 -0500 Subject: rtlwifi: rtl8192ee: Fix memory leak when loading firmware commit 519ce2f933fa14acf69d5c8cabcc18711943d629 upstream. In routine rtl92ee_set_fw_rsvdpagepkt(), the driver allocates an skb, but never calls rtl_cmd_send_packet(), which will free the buffer. All other rtlwifi drivers perform this operation correctly. This problem has been in the driver since it was included in the kernel. Fortunately, each firmware load only leaks 4 buffers, which likely explains why it has not previously been detected. Signed-off-by: Larry Finger Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c index 0708eedd9671..1c69e8140d9d 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c @@ -664,7 +664,7 @@ void rtl92ee_set_fw_rsvdpagepkt(struct ieee80211_hw *hw, bool b_dl_finished) struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_mac *mac = rtl_mac(rtl_priv(hw)); struct sk_buff *skb = NULL; - + bool rtstatus; u32 totalpacketlen; u8 u1rsvdpageloc[5] = { 0 }; bool b_dlok = false; @@ -727,7 +727,9 @@ void rtl92ee_set_fw_rsvdpagepkt(struct ieee80211_hw *hw, bool b_dl_finished) memcpy((u8 *)skb_put(skb, totalpacketlen), &reserved_page_packet, totalpacketlen); - b_dlok = true; + rtstatus = rtl_cmd_send_packet(hw, skb); + if (rtstatus) + b_dlok = true; if (b_dlok) { RT_TRACE(rtlpriv, COMP_POWER, DBG_LOUD , -- cgit v1.2.3 From f1be21021099046fbe288ae957cd75ab1c7a0daf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Nov 2017 14:55:35 +0100 Subject: rtlwifi: fix uninitialized rtlhal->last_suspend_sec time commit 3f2a162fab15aee243178b5308bb5d1206fc4043 upstream. We set rtlhal->last_suspend_sec to an uninitialized stack variable, but unfortunately gcc never warned about this, I only found it while working on another patch. I opened a gcc bug for this. Presumably the value of rtlhal->last_suspend_sec is not all that important, but it does get used, so we probably want the patch backported to stable kernels. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82839 Signed-off-by: Arnd Bergmann Acked-by: Larry Finger Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c index bbb789f8990b..738d541a2255 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c @@ -1377,6 +1377,7 @@ static void _rtl8821ae_get_wakeup_reason(struct ieee80211_hw *hw) ppsc->wakeup_reason = 0; + do_gettimeofday(&ts); rtlhal->last_suspend_sec = ts.tv_sec; switch (fw_reason) { -- cgit v1.2.3 From 189bc689547a57100cb0c5d0e689a8555e98b547 Mon Sep 17 00:00:00 2001 From: Rameshwar Prasad Sahu Date: Thu, 2 Nov 2017 16:31:07 +0530 Subject: ata: fixes kernel crash while tracing ata_eh_link_autopsy event commit f1601113ddc0339a745e702f4fb1ca37d4875e65 upstream. When tracing ata link error event, the kernel crashes when the disk is removed due to NULL pointer access by trace_ata_eh_link_autopsy API. This occurs as the dev is NULL when the disk disappeared. This patch fixes this crash by calling trace_ata_eh_link_autopsy only if "dev" is not NULL. v2 changes: Removed direct passing "link" pointer instead of "dev" in trace API. Signed-off-by: Rameshwar Prasad Sahu Signed-off-by: Tejun Heo Fixes: 255c03d15a29 ("libata: Add tracepoints") Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-eh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 91a9e6af2ec4..75cced210b2a 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2245,8 +2245,8 @@ static void ata_eh_link_autopsy(struct ata_link *link) if (dev->flags & ATA_DFLAG_DUBIOUS_XFER) eflags |= ATA_EFLAG_DUBIOUS_XFER; ehc->i.action |= ata_eh_speed_down(dev, eflags, all_err_mask); + trace_ata_eh_link_autopsy(dev, ehc->i.action, all_err_mask); } - trace_ata_eh_link_autopsy(dev, ehc->i.action, all_err_mask); DPRINTK("EXIT\n"); } -- cgit v1.2.3 From db12d9b5a18143712362b5ccc4b077bf57b040ec Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 6 Oct 2017 23:09:55 -0400 Subject: ext4: fix interaction between i_size, fallocate, and delalloc after a crash commit 51e3ae81ec58e95f10a98ef3dd6d7bce5d8e35a2 upstream. If there are pending writes subject to delayed allocation, then i_size will show size after the writes have completed, while i_disksize contains the value of i_size on the disk (since the writes have not been persisted to disk). If fallocate(2) is called with the FALLOC_FL_KEEP_SIZE flag, either with or without the FALLOC_FL_ZERO_RANGE flag set, and the new size after the fallocate(2) is between i_size and i_disksize, then after a crash, if a journal commit has resulted in the changes made by the fallocate() call to be persisted after a crash, but the delayed allocation write has not resolved itself, i_size would not be updated, and this would cause the following e2fsck complaint: Inode 12, end of extent exceeds allowed value (logical block 33, physical block 33441, len 7) This can only take place on a sparse file, where the fallocate(2) call is allocating blocks in a range which is before a pending delayed allocation write which is extending i_size. Since this situation is quite rare, and the window in which the crash must take place is typically < 30 seconds, in practice this condition will rarely happen. Nevertheless, it can be triggered in testing, and in particular by xfstests generic/456. Signed-off-by: Theodore Ts'o Reported-by: Amir Goldstein Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 61d5bfc7318c..31a3e480d484 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4818,7 +4818,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) @@ -4994,7 +4995,8 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } if (!(mode & FALLOC_FL_KEEP_SIZE) && - offset + len > i_size_read(inode)) { + (offset + len > i_size_read(inode) || + offset + len > EXT4_I(inode)->i_disksize)) { new_size = offset + len; ret = inode_newsize_ok(inode, new_size); if (ret) -- cgit v1.2.3 From b71cf750ddd3ab64e8596b9b07ff78a8a669ef30 Mon Sep 17 00:00:00 2001 From: Henrik Eriksson Date: Tue, 21 Nov 2017 09:29:28 +0100 Subject: ALSA: pcm: update tstamp only if audio_tstamp changed commit 20e3f985bb875fea4f86b04eba4b6cc29bfd6b71 upstream. commit 3179f6200188 ("ALSA: core: add .get_time_info") had a side effect of changing the behaviour of the PCM runtime tstamp. Prior to this change tstamp was not updated by snd_pcm_update_hw_ptr0() unless the hw_ptr had moved, after this change tstamp was always updated. For an application using alsa-lib, doing snd_pcm_readi() followed by snd_pcm_status() to estimate the age of the read samples by subtracting status->avail * [sample rate] from status->tstamp this change degraded the accuracy of the estimate on devices where the pcm hw does not provide a granular hw_ptr, e.g., devices using soc-generic-dmaengine-pcm.c and a dma-engine with residue_granularity DMA_RESIDUE_GRANULARITY_DESCRIPTOR. The accuracy of the estimate depended on the latency between the PCM hw completing a period and the driver called snd_pcm_period_elapsed() to notify ALSA core, typically determined by interrupt handling latency. After the change the accuracy of the estimate depended on the latency between the PCM hw completing a period and the application calling snd_pcm_status(), determined by the scheduling of the application process. The maximum error of the estimate is one period length in both cases, but the error average and variance is smaller when it depends on interrupt latency. Instead of always updating tstamp, update it only if audio_tstamp changed. Fixes: 3179f6200188 ("ALSA: core: add .get_time_info") Suggested-by: Pierre-Louis Bossart Signed-off-by: Henrik Eriksson Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/pcm_lib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 0aca39762ed0..cd20f91326fe 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -264,8 +264,10 @@ static void update_audio_tstamp(struct snd_pcm_substream *substream, runtime->rate); *audio_tstamp = ns_to_timespec(audio_nsecs); } - runtime->status->audio_tstamp = *audio_tstamp; - runtime->status->tstamp = *curr_tstamp; + if (!timespec_equal(&runtime->status->audio_tstamp, audio_tstamp)) { + runtime->status->audio_tstamp = *audio_tstamp; + runtime->status->tstamp = *curr_tstamp; + } /* * re-take a driver timestamp to let apps detect if the reference tstamp -- cgit v1.2.3 From d1316b9d83de1f0ce969206aaa5d3a1a60dc5c37 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Nov 2017 16:55:51 +0100 Subject: ALSA: usb-audio: Add sanity checks to FE parser commit d937cd6790a2bef2d07b500487646bd794c039bb upstream. When the usb-audio descriptor contains the malformed feature unit description with a too short length, the driver may access out-of-bounds. Add a sanity check of the header size at the beginning of parse_audio_feature_unit(). Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0") Reported-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/mixer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index a23efc8671d6..7008e74f8235 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -1397,6 +1397,12 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, __u8 *bmaControls; if (state->mixer->protocol == UAC_VERSION_1) { + if (hdr->bLength < 7) { + usb_audio_err(state->chip, + "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", + unitid); + return -EINVAL; + } csize = hdr->bControlSize; if (!csize) { usb_audio_dbg(state->chip, @@ -1414,6 +1420,12 @@ static int parse_audio_feature_unit(struct mixer_build *state, int unitid, } } else { struct uac2_feature_unit_descriptor *ftr = _ftr; + if (hdr->bLength < 6) { + usb_audio_err(state->chip, + "unit %u: invalid UAC_FEATURE_UNIT descriptor\n", + unitid); + return -EINVAL; + } csize = 4; channels = (hdr->bLength - 6) / 4 - 1; bmaControls = ftr->bmaControls; -- cgit v1.2.3 From 0b6cede2e45500452c5c49d9cfb7d10cb1aca437 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Nov 2017 17:00:32 +0100 Subject: ALSA: usb-audio: Fix potential out-of-bound access at parsing SU commit f658f17b5e0e339935dca23e77e0f3cad591926b upstream. The usb-audio driver may trigger an out-of-bound access at parsing a malformed selector unit, as it checks the header length only after evaluating bNrInPins field, which can be already above the given length. Fix it by adding the length check beforehand. Fixes: 99fc86450c43 ("ALSA: usb-mixer: parse descriptors with structs") Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/mixer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 7008e74f8235..1050008d7719 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2026,7 +2026,8 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid, const struct usbmix_name_map *map; char **namelist; - if (!desc->bNrInPins || desc->bLength < 5 + desc->bNrInPins) { + if (desc->bLength < 5 || !desc->bNrInPins || + desc->bLength < 5 + desc->bNrInPins) { usb_audio_err(state->chip, "invalid SELECTOR UNIT descriptor %d\n", unitid); return -EINVAL; -- cgit v1.2.3 From 3532750d20f56f0165bd196aee3c6bb9fb5c1fdb Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Nov 2017 17:28:06 +0100 Subject: ALSA: usb-audio: Add sanity checks in v2 clock parsers commit 0a62d6c966956d77397c32836a5bbfe3af786fc1 upstream. The helper functions to parse and look for the clock source, selector and multiplier unit may return the descriptor with a too short length than required, while there is no sanity check in the caller side. Add some sanity checks in the parsers, at least, to guarantee the given descriptor size, for avoiding the potential crashes. Fixes: 79f920fbff56 ("ALSA: usb-audio: parse clock topology of UAC2 devices") Reported-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/clock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sound/usb/clock.c b/sound/usb/clock.c index 7ccbcaf6a147..66294eb64501 100644 --- a/sound/usb/clock.c +++ b/sound/usb/clock.c @@ -43,7 +43,7 @@ static struct uac_clock_source_descriptor * while ((cs = snd_usb_find_csint_desc(ctrl_iface->extra, ctrl_iface->extralen, cs, UAC2_CLOCK_SOURCE))) { - if (cs->bClockID == clock_id) + if (cs->bLength >= sizeof(*cs) && cs->bClockID == clock_id) return cs; } @@ -59,8 +59,11 @@ static struct uac_clock_selector_descriptor * while ((cs = snd_usb_find_csint_desc(ctrl_iface->extra, ctrl_iface->extralen, cs, UAC2_CLOCK_SELECTOR))) { - if (cs->bClockID == clock_id) + if (cs->bLength >= sizeof(*cs) && cs->bClockID == clock_id) { + if (cs->bLength < 5 + cs->bNrInPins) + return NULL; return cs; + } } return NULL; @@ -75,7 +78,7 @@ static struct uac_clock_multiplier_descriptor * while ((cs = snd_usb_find_csint_desc(ctrl_iface->extra, ctrl_iface->extralen, cs, UAC2_CLOCK_MULTIPLIER))) { - if (cs->bClockID == clock_id) + if (cs->bLength >= sizeof(*cs) && cs->bClockID == clock_id) return cs; } -- cgit v1.2.3 From ef67455316481c4a418542495636505a0ab8c88e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Nov 2017 16:36:11 +0100 Subject: ALSA: timer: Remove kernel warning at compat ioctl error paths commit 3d4e8303f2c747c8540a0a0126d0151514f6468b upstream. Some timer compat ioctls have NULL checks of timer instance with snd_BUG_ON() that bring up WARN_ON() when the debug option is set. Actually the condition can be met in the normal situation and it's confusing and bad to spew kernel warnings with stack trace there. Let's remove snd_BUG_ON() invocation and replace with the simple checks. Also, correct the error code to EBADFD to follow the native ioctl error handling. Reported-by: syzbot Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/timer_compat.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sound/core/timer_compat.c b/sound/core/timer_compat.c index 0b4b028e8e98..de9155eed727 100644 --- a/sound/core/timer_compat.c +++ b/sound/core/timer_compat.c @@ -40,11 +40,11 @@ static int snd_timer_user_info_compat(struct file *file, struct snd_timer *t; tu = file->private_data; - if (snd_BUG_ON(!tu->timeri)) - return -ENXIO; + if (!tu->timeri) + return -EBADFD; t = tu->timeri->timer; - if (snd_BUG_ON(!t)) - return -ENXIO; + if (!t) + return -EBADFD; memset(&info, 0, sizeof(info)); info.card = t->card ? t->card->number : -1; if (t->hw.flags & SNDRV_TIMER_HW_SLAVE) @@ -73,8 +73,8 @@ static int snd_timer_user_status_compat(struct file *file, struct snd_timer_status32 status; tu = file->private_data; - if (snd_BUG_ON(!tu->timeri)) - return -ENXIO; + if (!tu->timeri) + return -EBADFD; memset(&status, 0, sizeof(status)); status.tstamp.tv_sec = tu->tstamp.tv_sec; status.tstamp.tv_nsec = tu->tstamp.tv_nsec; -- cgit v1.2.3 From 509ab500a240f42fd35a696e735ca8e787668791 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Wed, 22 Nov 2017 15:21:32 +0800 Subject: ALSA: hda/realtek - Fix ALC700 family no sound issue commit 2d7fe6185722b0817bb345f62ab06b76a7b26542 upstream. It maybe the typo for ALC700 support patch. To fix the bit value on this patch. Fixes: 6fbae35a3170 ("ALSA: hda/realtek - Add support for new codecs ALC700/ALC701/ALC703") Signed-off-by: Kailang Yang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index af0962307b7f..64085a512dbf 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6254,7 +6254,7 @@ static int patch_alc269(struct hda_codec *codec) case 0x10ec0703: spec->codec_variant = ALC269_TYPE_ALC700; spec->gen.mixer_nid = 0; /* ALC700 does not have any loopback mixer path */ - alc_update_coef_idx(codec, 0x4a, 0, 1 << 15); /* Combo jack auto trigger control */ + alc_update_coef_idx(codec, 0x4a, 1 << 15, 0); /* Combo jack auto trigger control */ break; } -- cgit v1.2.3 From 1b11593eb742ab6b69b1c23110fba024c5257101 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Sep 2017 18:36:44 -0400 Subject: fix a page leak in vhost_scsi_iov_to_sgl() error recovery commit 11d49e9d089ccec81be87c2386dfdd010d7f7f6e upstream. we are advancing sg as we go, so the pages we need to drop in case of error are *before* the current sg. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/scsi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index e4110d6de0b5..da6cc25baaef 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -703,6 +703,7 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write, struct scatterlist *sg, int sg_count) { size_t off = iter->iov_offset; + struct scatterlist *p = sg; int i, ret; for (i = 0; i < iter->nr_segs; i++) { @@ -711,8 +712,8 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write, ret = vhost_scsi_map_to_sgl(cmd, base, len, sg, write); if (ret < 0) { - for (i = 0; i < sg_count; i++) { - struct page *page = sg_page(&sg[i]); + while (p < sg) { + struct page *page = sg_page(p++); if (page) put_page(page); } -- cgit v1.2.3 From 8709c5386109557e6cb9e3c38f011ce627fe3ce8 Mon Sep 17 00:00:00 2001 From: Tuomas Tynkkynen Date: Wed, 6 Sep 2017 17:59:07 +0300 Subject: fs/9p: Compare qid.path in v9fs_test_inode commit 8ee031631546cf2f7859cc69593bd60bbdd70b46 upstream. Commit fd2421f54423 ("fs/9p: When doing inode lookup compare qid details and inode mode bits.") transformed v9fs_qid_iget() to use iget5_locked() instead of iget_locked(). However, the test() callback is not checking fid.path at all, which means that a lookup in the inode cache can now accidentally locate a completely wrong inode from the same inode hash bucket if the other fields (qid.type and qid.version) match. Fixes: fd2421f54423 ("fs/9p: When doing inode lookup compare qid details and inode mode bits.") Reviewed-by: Latchesar Ionkov Signed-off-by: Tuomas Tynkkynen Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 3 +++ fs/9p/vfs_inode_dotl.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 511078586fa1..73f1d1b3a51c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index cb899af1babc..0b88744c6446 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } -- cgit v1.2.3 From 63bfc4c90a878df5fcb4a80a06390393dbee1519 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 27 Oct 2017 20:52:56 -0700 Subject: iscsi-target: Fix non-immediate TMR reference leak commit 3fc9fb13a4b2576aeab86c62fd64eb29ab68659c upstream. This patch fixes a se_cmd->cmd_kref reference leak that can occur when a non immediate TMR is proceeded our of command sequence number order, and CMDSN_LOWER_THAN_EXP is returned by iscsit_sequence_cmd(). To address this bug, call target_put_sess_cmd() during this special case following what iscsit_process_scsi_cmd() does upon CMDSN_LOWER_THAN_EXP. Cc: Mike Christie Cc: Hannes Reinecke Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index fd493412b172..bb73401f5761 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -1923,12 +1923,14 @@ attach: if (!(hdr->opcode & ISCSI_OP_IMMEDIATE)) { int cmdsn_ret = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn); - if (cmdsn_ret == CMDSN_HIGHER_THAN_EXP) + if (cmdsn_ret == CMDSN_HIGHER_THAN_EXP) { out_of_order_cmdsn = 1; - else if (cmdsn_ret == CMDSN_LOWER_THAN_EXP) + } else if (cmdsn_ret == CMDSN_LOWER_THAN_EXP) { + target_put_sess_cmd(&cmd->se_cmd); return 0; - else if (cmdsn_ret == CMDSN_ERROR_CANNOT_RECOVER) + } else if (cmdsn_ret == CMDSN_ERROR_CANNOT_RECOVER) { return -1; + } } iscsit_ack_from_expstatsn(conn, be32_to_cpu(hdr->exp_statsn)); -- cgit v1.2.3 From 4e426ed57cd24b8d6621a4cf60d715d7a7da9a76 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Fri, 22 Sep 2017 16:48:28 -0700 Subject: target: Fix QUEUE_FULL + SCSI task attribute handling commit 1c79df1f349fb6050016cea4ef1dfbc3853a5685 upstream. This patch fixes a bug during QUEUE_FULL where transport_complete_qf() calls transport_complete_task_attr() after it's already been invoked by target_complete_ok_work() or transport_generic_request_failure() during initial completion, preceeding QUEUE_FULL. This will result in se_device->simple_cmds, se_device->dev_cur_ordered_id and/or se_device->dev_ordered_sync being updated multiple times for a single se_cmd. To address this bug, clear SCF_TASK_ATTR_SET after the first call to transport_complete_task_attr(), and avoid updating SCSI task attribute related counters for any subsequent calls. Also, when a se_cmd is deferred due to ordered tags and executed via target_restart_delayed_cmds(), set CMD_T_SENT before execution matching what target_execute_cmd() does. Cc: Michael Cyr Cc: Bryant G. Ly Cc: Mike Christie Cc: Hannes Reinecke Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_transport.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index a42054edd427..37abf881ca75 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1970,6 +1970,8 @@ static void target_restart_delayed_cmds(struct se_device *dev) list_del(&cmd->se_delayed_node); spin_unlock(&dev->delayed_cmd_lock); + cmd->transport_state |= CMD_T_SENT; + __target_execute_cmd(cmd, true); if (cmd->sam_task_attr == TCM_ORDERED_TAG) @@ -2007,6 +2009,8 @@ static void transport_complete_task_attr(struct se_cmd *cmd) pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED\n", dev->dev_cur_ordered_id); } + cmd->se_cmd_flags &= ~SCF_TASK_ATTR_SET; + restart: target_restart_delayed_cmds(dev); } -- cgit v1.2.3 From 8293dc75de475ae9b6b0071f8dcf3585cb8be42a Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Wed, 11 Oct 2017 16:54:42 +0200 Subject: KVM: nVMX: set IDTR and GDTR limits when loading L1 host state commit 21f2d551183847bc7fbe8d866151d00cdad18752 upstream. Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers: "The GDTR and IDTR limits are each set to FFFFH." Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9114588e3e61..67ba0d8f87c7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10394,6 +10394,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); + vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); + vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) -- cgit v1.2.3 From a694b1f85aeafb674ad3c47e2efa3c3de69abf5e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Oct 2017 09:13:27 +0200 Subject: KVM: SVM: obey guest PAT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 15038e14724799b8c205beb5f20f9e54896013c3 upstream. For many years some users of assigned devices have reported worse performance on AMD processors with NPT than on AMD without NPT, Intel or bare metal. The reason turned out to be that SVM is discarding the guest PAT setting and uses the default (PA0=PA4=WB, PA1=PA5=WT, PA2=PA6=UC-, PA3=UC). The guest might be using a different setting, and especially might want write combining but isn't getting it (instead getting slow UC or UC- accesses). Thanks a lot to geoff@hostfission.com for noticing the relation to the g_pat setting. The patch has been tested also by a bunch of people on VFIO users forums. Fixes: 709ddebf81cb40e3c36c6109a7892e8b93a09464 Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196409 Signed-off-by: Paolo Bonzini Reviewed-by: David Hildenbrand Tested-by: Nick Sarnie Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 899c40f826dd..4e1b254c3695 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3114,6 +3114,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_IA32_CR_PAT: + if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + return 1; + vcpu->arch.pat = data; + svm->vmcb->save.g_pat = data; + mark_dirty(svm->vmcb, VMCB_NPT); + break; case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr); break; -- cgit v1.2.3 From 36d2f19430e2711a1b327ac1734de4a52c67abd2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Oct 2017 17:31:42 -0400 Subject: SUNRPC: Fix tracepoint storage issues with svc_recv and svc_rqst_status commit e9d4bf219c83d09579bc62512fea2ca10f025d93 upstream. There is no guarantee that either the request or the svc_xprt exist by the time we get round to printing the trace message. Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- include/trace/events/sunrpc.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5664ca07c9c7..a01a076ea060 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -455,20 +455,22 @@ TRACE_EVENT(svc_recv, TP_ARGS(rqst, status), TP_STRUCT__entry( - __field(struct sockaddr *, addr) __field(__be32, xid) __field(int, status) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, rqst->rq_addrlen) ), TP_fast_assign( - __entry->addr = (struct sockaddr *)&rqst->rq_addr; __entry->xid = status > 0 ? rqst->rq_xid : 0; __entry->status = status; __entry->flags = rqst->rq_flags; + memcpy(__get_dynamic_array(addr), + &rqst->rq_addr, rqst->rq_addrlen); ), - TP_printk("addr=%pIScp xid=0x%x status=%d flags=%s", __entry->addr, + TP_printk("addr=%pIScp xid=0x%x status=%d flags=%s", + (struct sockaddr *)__get_dynamic_array(addr), be32_to_cpu(__entry->xid), __entry->status, show_rqstp_flags(__entry->flags)) ); @@ -480,22 +482,23 @@ DECLARE_EVENT_CLASS(svc_rqst_status, TP_ARGS(rqst, status), TP_STRUCT__entry( - __field(struct sockaddr *, addr) __field(__be32, xid) - __field(int, dropme) __field(int, status) __field(unsigned long, flags) + __dynamic_array(unsigned char, addr, rqst->rq_addrlen) ), TP_fast_assign( - __entry->addr = (struct sockaddr *)&rqst->rq_addr; __entry->xid = rqst->rq_xid; __entry->status = status; __entry->flags = rqst->rq_flags; + memcpy(__get_dynamic_array(addr), + &rqst->rq_addr, rqst->rq_addrlen); ), TP_printk("addr=%pIScp rq_xid=0x%x status=%d flags=%s", - __entry->addr, be32_to_cpu(__entry->xid), + (struct sockaddr *)__get_dynamic_array(addr), + be32_to_cpu(__entry->xid), __entry->status, show_rqstp_flags(__entry->flags)) ); -- cgit v1.2.3 From eca460c485f93de1474352d1d509e1a3e9f1d407 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Fri, 11 Mar 2016 16:13:32 +0200 Subject: clk: ti: dra7-atl-clock: Fix of_node reference counting commit 660e1551939931657808d47838a3f443c0e83fd0 upstream. of_find_node_by_name() will call of_node_put() on the node so we need to get it first to avoid warnings. The cfg_node needs to be put after we have finished processing the properties. Signed-off-by: Peter Ujfalusi Tested-by: Nishanth Menon Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/ti/clk-dra7-atl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clk/ti/clk-dra7-atl.c b/drivers/clk/ti/clk-dra7-atl.c index 2e14dfb588f4..c77333230bdf 100644 --- a/drivers/clk/ti/clk-dra7-atl.c +++ b/drivers/clk/ti/clk-dra7-atl.c @@ -265,6 +265,7 @@ static int of_dra7_atl_clk_probe(struct platform_device *pdev) /* Get configuration for the ATL instances */ snprintf(prop, sizeof(prop), "atl%u", i); + of_node_get(node); cfg_node = of_find_node_by_name(node, prop); if (cfg_node) { ret = of_property_read_u32(cfg_node, "bws", @@ -278,6 +279,7 @@ static int of_dra7_atl_clk_probe(struct platform_device *pdev) atl_write(cinfo, DRA7_ATL_AWSMUX_REG(i), cdesc->aws); } + of_node_put(cfg_node); } cdesc->probed = true; -- cgit v1.2.3 From 037646313522559a65e91952c2e1ec623abf5147 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sat, 11 Nov 2017 17:29:29 +0100 Subject: clk: ti: dra7-atl-clock: fix child-node lookups commit 33ec6dbc5a02677509d97fe36cd2105753f0f0ea upstream. Fix child node-lookup during probe, which ended up searching the whole device tree depth-first starting at parent rather than just matching on its children. Note that the original premature free of the parent node has already been fixed separately, but that fix was apparently never backported to stable. Fixes: 9ac33b0ce81f ("CLK: TI: Driver for DRA7 ATL (Audio Tracking Logic)") Fixes: 660e15519399 ("clk: ti: dra7-atl-clock: Fix of_node reference counting") Cc: Peter Ujfalusi Signed-off-by: Johan Hovold Acked-by: Peter Ujfalusi Signed-off-by: Stephen Boyd Signed-off-by: Greg Kroah-Hartman --- drivers/clk/ti/clk-dra7-atl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clk/ti/clk-dra7-atl.c b/drivers/clk/ti/clk-dra7-atl.c index c77333230bdf..7d060ffe8975 100644 --- a/drivers/clk/ti/clk-dra7-atl.c +++ b/drivers/clk/ti/clk-dra7-atl.c @@ -265,8 +265,7 @@ static int of_dra7_atl_clk_probe(struct platform_device *pdev) /* Get configuration for the ATL instances */ snprintf(prop, sizeof(prop), "atl%u", i); - of_node_get(node); - cfg_node = of_find_node_by_name(node, prop); + cfg_node = of_get_child_by_name(node, prop); if (cfg_node) { ret = of_property_read_u32(cfg_node, "bws", &cdesc->bws); -- cgit v1.2.3 From 4dae2f771fa72083cf247ea44bcdb7096409527d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 26 Sep 2017 11:41:28 -0700 Subject: libnvdimm, namespace: fix label initialization to use valid seq numbers commit b18d4b8a25af6fe83d7692191d6ff962ea611c4f upstream. The set of valid sequence numbers is {1,2,3}. The specification indicates that an implementation should consider 0 a sign of a critical error: UEFI 2.7: 13.19 NVDIMM Label Protocol Software never writes the sequence number 00, so a correctly check-summed Index Block with this sequence number probably indicates a critical error. When software discovers this case it treats it as an invalid Index Block indication. While the expectation is that the invalid block is just thrown away, the Robustness Principle says we should fix this to make both sequence numbers valid. Fixes: f524bf271a5c ("libnvdimm: write pmem label set") Reported-by: Juston Li Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- drivers/nvdimm/label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 96526dcfdd37..ff7b9632ad61 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -823,7 +823,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels) nsindex = to_namespace_index(ndd, 0); memset(nsindex, 0, ndd->nsarea.config_size); for (i = 0; i < 2; i++) { - int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT); if (rc) return rc; -- cgit v1.2.3 From 54a8d930b93f171acaffbac83a1d0f595fa277bd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 26 Sep 2017 11:21:24 -0700 Subject: libnvdimm, namespace: make 'resource' attribute only readable by root commit c1fb3542074fd0c4d901d778bd52455111e4eb6f upstream. For the same reason that /proc/iomem returns 0's for non-root readers and acpi tables are root-only, make the 'resource' attribute for namespace devices only readable by root. Otherwise we disclose physical address information. Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation") Reported-by: Dave Hansen Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- drivers/nvdimm/namespace_devs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index aae7379af4e4..c2184104b789 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1305,7 +1305,7 @@ static umode_t namespace_visible(struct kobject *kobj, if (a == &dev_attr_resource.attr) { if (is_namespace_blk(dev)) return 0; - return a->mode; + return 0400; } if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { -- cgit v1.2.3 From 3e32b40435b9d5e073c788dcfebb679c8a6ace1e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 11 Oct 2017 10:27:22 -0700 Subject: IB/srpt: Do not accept invalid initiator port names commit c70ca38960399a63d5c048b7b700612ea321d17e upstream. Make srpt_parse_i_port_id() return a negative value if hex2bin() fails. Fixes: commit a42d985bd5b2 ("ib_srpt: Initial SRP Target merge for v3.3-rc1") Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/srpt/ib_srpt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index eaabf3125846..c52131233ba7 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -3425,7 +3425,7 @@ static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) { const char *p; unsigned len, count, leading_zero_bytes; - int ret, rc; + int ret; p = name; if (strncasecmp(p, "0x", 2) == 0) @@ -3437,10 +3437,9 @@ static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name) count = min(len / 2, 16U); leading_zero_bytes = 16 - count; memset(i_port_id, 0, leading_zero_bytes); - rc = hex2bin(i_port_id + leading_zero_bytes, p, count); - if (rc < 0) - pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc); - ret = 0; + ret = hex2bin(i_port_id + leading_zero_bytes, p, count); + if (ret < 0) + pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", ret); out: return ret; } -- cgit v1.2.3 From ecc5e8914766d6d2c14f06e5dac0bfd173505d85 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 11 Oct 2017 10:27:26 -0700 Subject: IB/srp: Avoid that a cable pull can trigger a kernel crash commit 8a0d18c62121d3c554a83eb96e2752861d84d937 upstream. This patch fixes the following kernel crash: general protection fault: 0000 [#1] PREEMPT SMP Workqueue: ib_mad2 timeout_sends [ib_core] Call Trace: ib_sa_path_rec_callback+0x1c4/0x1d0 [ib_core] send_handler+0xb2/0xd0 [ib_core] timeout_sends+0x14d/0x220 [ib_core] process_one_work+0x200/0x630 worker_thread+0x4e/0x3b0 kthread+0x113/0x150 Fixes: commit aef9ec39c47f ("IB: Add SCSI RDMA Protocol (SRP) initiator") Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/srp/ib_srp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e397f1b0af09..9a99cee2665a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -670,12 +670,19 @@ static void srp_path_rec_completion(int status, static int srp_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; - int ret; + int ret = -ENODEV; ch->path.numb_path = 1; init_completion(&ch->done); + /* + * Avoid that the SCSI host can be removed by srp_remove_target() + * before srp_path_rec_completion() is called. + */ + if (!scsi_host_get(target->scsi_host)) + goto out; + ch->path_query_id = ib_sa_path_rec_get(&srp_sa_client, target->srp_host->srp_dev->dev, target->srp_host->port, @@ -689,18 +696,24 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) GFP_KERNEL, srp_path_rec_completion, ch, &ch->path_query); - if (ch->path_query_id < 0) - return ch->path_query_id; + ret = ch->path_query_id; + if (ret < 0) + goto put; ret = wait_for_completion_interruptible(&ch->done); if (ret < 0) - return ret; + goto put; - if (ch->status < 0) + ret = ch->status; + if (ret < 0) shost_printk(KERN_WARNING, target->scsi_host, PFX "Path record query failed\n"); - return ch->status; +put: + scsi_host_put(target->scsi_host); + +out: + return ret; } static int srp_send_req(struct srp_rdma_ch *ch, bool multich) -- cgit v1.2.3 From 76c389ca1331aeec038d7a69cb1b8a0f3719dd72 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Sun, 9 Jul 2017 13:08:58 +0200 Subject: NFC: fix device-allocation error return commit c45e3e4c5b134b081e8af362109905427967eb19 upstream. A recent change fixing NFC device allocation itself introduced an error-handling bug by returning an error pointer in case device-id allocation failed. This is clearly broken as the callers still expected NULL to be returned on errors as detected by Dan's static checker. Fix this up by returning NULL in the event that we've run out of memory when allocating a new device id. Note that the offending commit is marked for stable (3.8) so this fix needs to be backported along with it. Fixes: 20777bc57c34 ("NFC: fix broken device allocation") Reported-by: Dan Carpenter Signed-off-by: Johan Hovold Signed-off-by: Samuel Ortiz Signed-off-by: Greg Kroah-Hartman --- net/nfc/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/nfc/core.c b/net/nfc/core.c index c5a2c7e733b3..1471e4b0aa2c 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -1093,7 +1093,7 @@ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, err_free_dev: kfree(dev); - return ERR_PTR(rc); + return NULL; } EXPORT_SYMBOL(nfc_allocate_device); -- cgit v1.2.3 From 146d533713859f7990703021e33ab63c8f7bcb1d Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:44 -0600 Subject: i40e: Use smp_rmb rather than read_barrier_depends commit 52c6912fde0133981ee50ba08808f257829c4c93 upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with i40e as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 4edbab6ca7ef..b5b228c9a030 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3595,7 +3595,7 @@ static bool i40e_clean_fdir_tx_irq(struct i40e_ring *tx_ring, int budget) break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->cmd_type_offset_bsz & diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 26c55bba4bf3..6dcc3854844d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -663,7 +663,7 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget) break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) -- cgit v1.2.3 From 391cdaaaa9d9ce6b4cdc659d94ae59f9384f5e37 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:47 -0600 Subject: igb: Use smp_rmb rather than read_barrier_depends commit c4cb99185b4cc96c0a1c70104dc21ae14d7e7f28 upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with igb as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index ff6e57d788eb..c55552c3d2f9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6433,7 +6433,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector) break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if DD is not set pending work has not been completed */ if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD))) -- cgit v1.2.3 From 2f7de4d5f525fc39c83e6c423e12f1954106bf38 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:46 -0600 Subject: igbvf: Use smp_rmb rather than read_barrier_depends commit 1e1f9ca546556e508d021545861f6b5fc75a95fe upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with igbvf as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igbvf/netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 297af801f051..519b72c41888 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -809,7 +809,7 @@ static bool igbvf_clean_tx_irq(struct igbvf_ring *tx_ring) break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if DD is not set pending work has not been completed */ if (!(eop_desc->wb.status & cpu_to_le32(E1000_TXD_STAT_DD))) -- cgit v1.2.3 From a8e699dfca04191e209a63d484e347aa921422e2 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:45 -0600 Subject: ixgbevf: Use smp_rmb rather than read_barrier_depends commit ae0c585d93dfaf923d2c7eb44b2c3ab92854ea9b upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with ixgbevf as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 592ff237d692..50bbad37d640 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -312,7 +312,7 @@ static bool ixgbevf_clean_tx_irq(struct ixgbevf_q_vector *q_vector, break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if DD is not set pending work has not been completed */ if (!(eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) -- cgit v1.2.3 From b4ca98b7a8030a154478cfdf48fadb6ddbb2b04b Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:49 -0600 Subject: i40evf: Use smp_rmb rather than read_barrier_depends commit f72271e2a0ae4277d53c4053f5eed8bb346ba38a upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with i40evf as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 39db70a597ed..1ed27fcd5031 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -172,7 +172,7 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget) break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* we have caught up to head, no work left to do */ if (tx_head == tx_desc) -- cgit v1.2.3 From bb923a81c33e543ead08420ac03e7d704243d6b0 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:48 -0600 Subject: fm10k: Use smp_rmb rather than read_barrier_depends commit 7b8edcc685b5e2c3c37aa13dc50a88e84a5bfef8 upstream. The original issue being fixed in this patch was seen with the ixgbe driver, but the same issue exists with fm10k as well, as the code is very similar. read_barrier_depends is not sufficient to ensure loads following it are not speculatively loaded out of order by the CPU, which can result in stale data being loaded, causing potential system crashes. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 09281558bfbc..c21fa56afd7c 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -1226,7 +1226,7 @@ static bool fm10k_clean_tx_irq(struct fm10k_q_vector *q_vector, break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if DD is not set pending work has not been completed */ if (!(eop_desc->flags & FM10K_TXD_FLAG_DONE)) -- cgit v1.2.3 From aef7cdb6a6ffa07da7f713d9164d50f954df10b6 Mon Sep 17 00:00:00 2001 From: Brian King Date: Fri, 17 Nov 2017 11:05:43 -0600 Subject: ixgbe: Fix skb list corruption on Power systems commit 0a9a17e3bb4564caf4bfe2a6783ae1287667d188 upstream. This patch fixes an issue seen on Power systems with ixgbe which results in skb list corruption and an eventual kernel oops. The following is what was observed: CPU 1 CPU2 ============================ ============================ 1: ixgbe_xmit_frame_ring ixgbe_clean_tx_irq 2: first->skb = skb eop_desc = tx_buffer->next_to_watch 3: ixgbe_tx_map read_barrier_depends() 4: wmb check adapter written status bit 5: first->next_to_watch = tx_desc napi_consume_skb(tx_buffer->skb ..); 6: writel(i, tx_ring->tail); The read_barrier_depends is insufficient to ensure that tx_buffer->skb does not get loaded prior to tx_buffer->next_to_watch, which then results in loading a stale skb pointer. This patch replaces the read_barrier_depends with smp_rmb to ensure loads are ordered with respect to the load of tx_buffer->next_to_watch. Signed-off-by: Brian King Acked-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 83645d8503d4..a5b443171b8b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1114,7 +1114,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, break; /* prevent any other reads prior to eop_desc */ - read_barrier_depends(); + smp_rmb(); /* if DD is not set pending work has not been completed */ if (!(eop_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) -- cgit v1.2.3 From 937a91cd399220c11f513e899218b26b226ee334 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 11 Nov 2017 17:11:16 -0500 Subject: parisc: Fix validity check of pointer size argument in new CAS implementation commit 05f016d2ca7a4fab99d5d5472168506ddf95e74f upstream. As noted by Christoph Biedl, passing a pointer size of 4 in the new CAS implementation causes a kernel crash. The attached patch corrects the off by one error in the argument validity check. In reviewing the code, I noticed that we only perform word operations with the pointer size argument. The subi instruction intentionally uses a word condition on 64-bit kernels. Nullification was used instead of a cmpib instruction as the branch should never be taken. The shlw pseudo-operation generates a depw,z instruction and it clears the target before doing a shift left word deposit. Thus, we don't need to clip the upper 32 bits of this argument on 64-bit kernels. Tested with a gcc testsuite run with a 64-bit kernel. The gcc atomic code in libgcc is the only direct user of the new CAS implementation that I am aware of. Signed-off-by: John David Anglin Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/syscall.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index c6b855f7892c..9f22195b90ed 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -688,15 +688,15 @@ cas_action: /* ELF32 Process entry path */ lws_compare_and_swap_2: #ifdef CONFIG_64BIT - /* Clip the input registers */ + /* Clip the input registers. We don't need to clip %r23 as we + only use it for word operations */ depdi 0, 31, 32, %r26 depdi 0, 31, 32, %r25 depdi 0, 31, 32, %r24 - depdi 0, 31, 32, %r23 #endif /* Check the validity of the size pointer */ - subi,>>= 4, %r23, %r0 + subi,>>= 3, %r23, %r0 b,n lws_exit_nosys /* Jump to the functions which will load the old and new values into -- cgit v1.2.3 From a2943ce580f57d75df763c37846e9e478861e0fd Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 31 Aug 2017 21:55:57 +0530 Subject: powerpc/signal: Properly handle return value from uprobe_deny_signal() commit 46725b17f1c6c815a41429259b3f070c01e71bc1 upstream. When a uprobe is installed on an instruction that we currently do not emulate, we copy the instruction into a xol buffer and single step that instruction. If that instruction generates a fault, we abort the single stepping before invoking the signal handler. Once the signal handler is done, the uprobe trap is hit again since the instruction is retried and the process repeats. We use uprobe_deny_signal() to detect if the xol instruction triggered a signal. If so, we clear TIF_SIGPENDING and set TIF_UPROBE so that the signal is not handled until after the single stepping is aborted. In this case, uprobe_deny_signal() returns true and get_signal() ends up returning 0. However, in do_signal(), we are not looking at the return value, but depending on ksig.sig for further action, all with an uninitialized ksig that is not touched in this scenario. Fix the same by initializing ksig.sig to 0. Fixes: 129b69df9c90 ("powerpc: Use get_signal() signal_setup_done()") Reported-by: Anton Blanchard Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index cf8c7e4e0b21..984a54c85952 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -102,7 +102,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static void do_signal(struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); - struct ksignal ksig; + struct ksignal ksig = { .sig = 0 }; int ret; int is32 = is_32bit_task(); -- cgit v1.2.3 From 878c0f9a7c693b238b30a85f870e5334a66a3935 Mon Sep 17 00:00:00 2001 From: Michele Baldessari Date: Mon, 6 Nov 2017 08:50:22 -0500 Subject: media: Don't do DMA on stack for firmware upload in the AS102 driver commit b3120d2cc447ee77b9d69bf4ad7b452c9adb4d39 upstream. Firmware load on AS102 is using the stack which is not allowed any longer. We currently fail with: kernel: transfer buffer not dma capable kernel: ------------[ cut here ]------------ kernel: WARNING: CPU: 0 PID: 598 at drivers/usb/core/hcd.c:1595 usb_hcd_map_urb_for_dma+0x41d/0x620 kernel: Modules linked in: amd64_edac_mod(-) edac_mce_amd as102_fe dvb_as102(+) kvm_amd kvm snd_hda_codec_realtek dvb_core snd_hda_codec_generic snd_hda_codec_hdmi snd_hda_intel snd_hda_codec irqbypass crct10dif_pclmul crc32_pclmul snd_hda_core snd_hwdep snd_seq ghash_clmulni_intel sp5100_tco fam15h_power wmi k10temp i2c_piix4 snd_seq_device snd_pcm snd_timer parport_pc parport tpm_infineon snd tpm_tis soundcore tpm_tis_core tpm shpchp acpi_cpufreq xfs libcrc32c amdgpu amdkfd amd_iommu_v2 radeon hid_logitech_hidpp i2c_algo_bit drm_kms_helper crc32c_intel ttm drm r8169 mii hid_logitech_dj kernel: CPU: 0 PID: 598 Comm: systemd-udevd Not tainted 4.13.10-200.fc26.x86_64 #1 kernel: Hardware name: ASUS All Series/AM1I-A, BIOS 0505 03/13/2014 kernel: task: ffff979933b24c80 task.stack: ffffaf83413a4000 kernel: RIP: 0010:usb_hcd_map_urb_for_dma+0x41d/0x620 systemd-fsck[659]: /dev/sda2: clean, 49/128016 files, 268609/512000 blocks kernel: RSP: 0018:ffffaf83413a7728 EFLAGS: 00010282 systemd-udevd[604]: link_config: autonegotiation is unset or enabled, the speed and duplex are not writable. kernel: RAX: 000000000000001f RBX: ffff979930bce780 RCX: 0000000000000000 kernel: RDX: 0000000000000000 RSI: ffff97993ec0e118 RDI: ffff97993ec0e118 kernel: RBP: ffffaf83413a7768 R08: 000000000000039a R09: 0000000000000000 kernel: R10: 0000000000000001 R11: 00000000ffffffff R12: 00000000fffffff5 kernel: R13: 0000000001400000 R14: 0000000000000001 R15: ffff979930806800 kernel: FS: 00007effaca5c8c0(0000) GS:ffff97993ec00000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007effa9fca962 CR3: 0000000233089000 CR4: 00000000000406f0 kernel: Call Trace: kernel: usb_hcd_submit_urb+0x493/0xb40 kernel: ? page_cache_tree_insert+0x100/0x100 kernel: ? xfs_iunlock+0xd5/0x100 [xfs] kernel: ? xfs_file_buffered_aio_read+0x57/0xc0 [xfs] kernel: usb_submit_urb+0x22d/0x560 kernel: usb_start_wait_urb+0x6e/0x180 kernel: usb_bulk_msg+0xb8/0x160 kernel: as102_send_ep1+0x49/0xe0 [dvb_as102] kernel: ? devres_add+0x3f/0x50 kernel: as102_firmware_upload.isra.0+0x1dc/0x210 [dvb_as102] kernel: as102_fw_upload+0xb6/0x1f0 [dvb_as102] kernel: as102_dvb_register+0x2af/0x2d0 [dvb_as102] kernel: as102_usb_probe+0x1f3/0x260 [dvb_as102] kernel: usb_probe_interface+0x124/0x300 kernel: driver_probe_device+0x2ff/0x450 kernel: __driver_attach+0xa4/0xe0 kernel: ? driver_probe_device+0x450/0x450 kernel: bus_for_each_dev+0x6e/0xb0 kernel: driver_attach+0x1e/0x20 kernel: bus_add_driver+0x1c7/0x270 kernel: driver_register+0x60/0xe0 kernel: usb_register_driver+0x81/0x150 kernel: ? 0xffffffffc0807000 kernel: as102_usb_driver_init+0x1e/0x1000 [dvb_as102] kernel: do_one_initcall+0x50/0x190 kernel: ? __vunmap+0x81/0xb0 kernel: ? kfree+0x154/0x170 kernel: ? kmem_cache_alloc_trace+0x15f/0x1c0 kernel: ? do_init_module+0x27/0x1e9 kernel: do_init_module+0x5f/0x1e9 kernel: load_module+0x2602/0x2c30 kernel: SYSC_init_module+0x170/0x1a0 kernel: ? SYSC_init_module+0x170/0x1a0 kernel: SyS_init_module+0xe/0x10 kernel: do_syscall_64+0x67/0x140 kernel: entry_SYSCALL64_slow_path+0x25/0x25 kernel: RIP: 0033:0x7effab6cf3ea kernel: RSP: 002b:00007fff5cfcbbc8 EFLAGS: 00000246 ORIG_RAX: 00000000000000af kernel: RAX: ffffffffffffffda RBX: 00005569e0b83760 RCX: 00007effab6cf3ea kernel: RDX: 00007effac2099c5 RSI: 0000000000009a13 RDI: 00005569e0b98c50 kernel: RBP: 00007effac2099c5 R08: 00005569e0b83ed0 R09: 0000000000001d80 kernel: R10: 00007effab98db00 R11: 0000000000000246 R12: 00005569e0b98c50 kernel: R13: 00005569e0b81c60 R14: 0000000000020000 R15: 00005569dfadfdf7 kernel: Code: 48 39 c8 73 30 80 3d 59 60 9d 00 00 41 bc f5 ff ff ff 0f 85 26 ff ff ff 48 c7 c7 b8 6b d0 92 c6 05 3f 60 9d 00 01 e8 24 3d ad ff <0f> ff 8b 53 64 e9 09 ff ff ff 65 48 8b 0c 25 00 d3 00 00 48 8b kernel: ---[ end trace c4cae366180e70ec ]--- kernel: as10x_usb: error during firmware upload part1 Let's allocate the the structure dynamically so we can get the firmware loaded correctly: [ 14.243057] as10x_usb: firmware: as102_data1_st.hex loaded with success [ 14.500777] as10x_usb: firmware: as102_data2_st.hex loaded with success Signed-off-by: Michele Baldessari Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/as102/as102_fw.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/media/usb/as102/as102_fw.c b/drivers/media/usb/as102/as102_fw.c index 07d08c49f4d4..b2e16bb67572 100644 --- a/drivers/media/usb/as102/as102_fw.c +++ b/drivers/media/usb/as102/as102_fw.c @@ -101,18 +101,23 @@ static int as102_firmware_upload(struct as10x_bus_adapter_t *bus_adap, unsigned char *cmd, const struct firmware *firmware) { - struct as10x_fw_pkt_t fw_pkt; + struct as10x_fw_pkt_t *fw_pkt; int total_read_bytes = 0, errno = 0; unsigned char addr_has_changed = 0; + fw_pkt = kmalloc(sizeof(*fw_pkt), GFP_KERNEL); + if (!fw_pkt) + return -ENOMEM; + + for (total_read_bytes = 0; total_read_bytes < firmware->size; ) { int read_bytes = 0, data_len = 0; /* parse intel hex line */ read_bytes = parse_hex_line( (u8 *) (firmware->data + total_read_bytes), - fw_pkt.raw.address, - fw_pkt.raw.data, + fw_pkt->raw.address, + fw_pkt->raw.data, &data_len, &addr_has_changed); @@ -122,28 +127,28 @@ static int as102_firmware_upload(struct as10x_bus_adapter_t *bus_adap, /* detect the end of file */ total_read_bytes += read_bytes; if (total_read_bytes == firmware->size) { - fw_pkt.u.request[0] = 0x00; - fw_pkt.u.request[1] = 0x03; + fw_pkt->u.request[0] = 0x00; + fw_pkt->u.request[1] = 0x03; /* send EOF command */ errno = bus_adap->ops->upload_fw_pkt(bus_adap, (uint8_t *) - &fw_pkt, 2, 0); + fw_pkt, 2, 0); if (errno < 0) goto error; } else { if (!addr_has_changed) { /* prepare command to send */ - fw_pkt.u.request[0] = 0x00; - fw_pkt.u.request[1] = 0x01; + fw_pkt->u.request[0] = 0x00; + fw_pkt->u.request[1] = 0x01; - data_len += sizeof(fw_pkt.u.request); - data_len += sizeof(fw_pkt.raw.address); + data_len += sizeof(fw_pkt->u.request); + data_len += sizeof(fw_pkt->raw.address); /* send cmd to device */ errno = bus_adap->ops->upload_fw_pkt(bus_adap, (uint8_t *) - &fw_pkt, + fw_pkt, data_len, 0); if (errno < 0) @@ -152,6 +157,7 @@ static int as102_firmware_upload(struct as10x_bus_adapter_t *bus_adap, } } error: + kfree(fw_pkt); return (errno == 0) ? total_read_bytes : errno; } -- cgit v1.2.3 From d758f4d8bf20e74f2afc6dfd0120d332d0fb9217 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 8 Oct 2017 14:18:52 -0400 Subject: media: rc: check for integer overflow commit 3e45067f94bbd61dec0619b1c32744eb0de480c8 upstream. The ioctl LIRC_SET_REC_TIMEOUT would set a timeout of 704ns if called with a timeout of 4294968us. Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ir-lirc-codec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index efc21b1da211..ca107033e429 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -286,11 +286,14 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, if (!dev->max_timeout) return -ENOSYS; + /* Check for multiply overflow */ + if (val > U32_MAX / 1000) + return -EINVAL; + tmp = val * 1000; - if (tmp < dev->min_timeout || - tmp > dev->max_timeout) - return -EINVAL; + if (tmp < dev->min_timeout || tmp > dev->max_timeout) + return -EINVAL; dev->timeout = tmp; break; -- cgit v1.2.3 From 0870fb4c3566088dc222e582e43edbc9ececbce4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Sep 2017 05:40:18 -0300 Subject: cx231xx-cards: fix NULL-deref on missing association descriptor commit 6c3b047fa2d2286d5e438bcb470c7b1a49f415f6 upstream. Make sure to check that we actually have an Interface Association Descriptor before dereferencing it during probe to avoid dereferencing a NULL-pointer. Fixes: e0d3bafd0258 ("V4L/DVB (10954): Add cx231xx USB driver") Reported-by: Andrey Konovalov Signed-off-by: Johan Hovold Tested-by: Andrey Konovalov Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/cx231xx/cx231xx-cards.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/usb/cx231xx/cx231xx-cards.c b/drivers/media/usb/cx231xx/cx231xx-cards.c index 2c5f76d588ac..04ae21278440 100644 --- a/drivers/media/usb/cx231xx/cx231xx-cards.c +++ b/drivers/media/usb/cx231xx/cx231xx-cards.c @@ -1672,7 +1672,7 @@ static int cx231xx_usb_probe(struct usb_interface *interface, nr = dev->devno; assoc_desc = udev->actconfig->intf_assoc[0]; - if (assoc_desc->bFirstInterface != ifnum) { + if (!assoc_desc || assoc_desc->bFirstInterface != ifnum) { dev_err(d, "Not found matching IAD interface\n"); retval = -ENODEV; goto err_if; -- cgit v1.2.3 From 5a11b8458b35da1ad797973aa3d81120dd42cce8 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Tue, 17 Oct 2017 11:48:50 -0400 Subject: media: v4l2-ctrl: Fix flags field on Control events commit 9cac9d2fb2fe0e0cadacdb94415b3fe49e3f724f upstream. VIDIOC_DQEVENT and VIDIOC_QUERY_EXT_CTRL should give the same output for the control flags field. This patch creates a new function user_flags(), that calculates the user exported flags value (which is different than the kernel internal flags structure). This function is then used by all the code that exports the internal flags to userspace. Reported-by: Dimitrios Katsaros Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/v4l2-ctrls.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 4a1d9fdd14bb..523758e71fe6 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -1200,6 +1200,16 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type, } EXPORT_SYMBOL(v4l2_ctrl_fill); +static u32 user_flags(const struct v4l2_ctrl *ctrl) +{ + u32 flags = ctrl->flags; + + if (ctrl->is_ptr) + flags |= V4L2_CTRL_FLAG_HAS_PAYLOAD; + + return flags; +} + static void fill_event(struct v4l2_event *ev, struct v4l2_ctrl *ctrl, u32 changes) { memset(ev->reserved, 0, sizeof(ev->reserved)); @@ -1207,7 +1217,7 @@ static void fill_event(struct v4l2_event *ev, struct v4l2_ctrl *ctrl, u32 change ev->id = ctrl->id; ev->u.ctrl.changes = changes; ev->u.ctrl.type = ctrl->type; - ev->u.ctrl.flags = ctrl->flags; + ev->u.ctrl.flags = user_flags(ctrl); if (ctrl->is_ptr) ev->u.ctrl.value64 = 0; else @@ -2536,10 +2546,8 @@ int v4l2_query_ext_ctrl(struct v4l2_ctrl_handler *hdl, struct v4l2_query_ext_ctr else qc->id = ctrl->id; strlcpy(qc->name, ctrl->name, sizeof(qc->name)); - qc->flags = ctrl->flags; + qc->flags = user_flags(ctrl); qc->type = ctrl->type; - if (ctrl->is_ptr) - qc->flags |= V4L2_CTRL_FLAG_HAS_PAYLOAD; qc->elem_size = ctrl->elem_size; qc->elems = ctrl->elems; qc->nr_of_dims = ctrl->nr_of_dims; -- cgit v1.2.3 From cb1831a83e54cd3269a2420fce81c4fd8ae6f667 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Oct 2017 14:05:04 -0400 Subject: sched/rt: Simplify the IPI based RT balancing logic commit 4bdced5c9a2922521e325896a7bbbf0132c94e56 upstream. When a CPU lowers its priority (schedules out a high priority task for a lower priority one), a check is made to see if any other CPU has overloaded RT tasks (more than one). It checks the rto_mask to determine this and if so it will request to pull one of those tasks to itself if the non running RT task is of higher priority than the new priority of the next task to run on the current CPU. When we deal with large number of CPUs, the original pull logic suffered from large lock contention on a single CPU run queue, which caused a huge latency across all CPUs. This was caused by only having one CPU having overloaded RT tasks and a bunch of other CPUs lowering their priority. To solve this issue, commit: b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling") changed the way to request a pull. Instead of grabbing the lock of the overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work. Although the IPI logic worked very well in removing the large latency build up, it still could suffer from a large number of IPIs being sent to a single CPU. On a 80 CPU box, I measured over 200us of processing IPIs. Worse yet, when I tested this on a 120 CPU box, with a stress test that had lots of RT tasks scheduling on all CPUs, it actually triggered the hard lockup detector! One CPU had so many IPIs sent to it, and due to the restart mechanism that is triggered when the source run queue has a priority status change, the CPU spent minutes! processing the IPIs. Thinking about this further, I realized there's no reason for each run queue to send its own IPI. As all CPUs with overloaded tasks must be scanned regardless if there's one or many CPUs lowering their priority, because there's no current way to find the CPU with the highest priority task that can schedule to one of these CPUs, there really only needs to be one IPI being sent around at a time. This greatly simplifies the code! The new approach is to have each root domain have its own irq work, as the rto_mask is per root domain. The root domain has the following fields attached to it: rto_push_work - the irq work to process each CPU set in rto_mask rto_lock - the lock to protect some of the other rto fields rto_loop_start - an atomic that keeps contention down on rto_lock the first CPU scheduling in a lower priority task is the one to kick off the process. rto_loop_next - an atomic that gets incremented for each CPU that schedules in a lower priority task. rto_loop - a variable protected by rto_lock that is used to compare against rto_loop_next rto_cpu - The cpu to send the next IPI to, also protected by the rto_lock. When a CPU schedules in a lower priority task and wants to make sure overloaded CPUs know about it. It increments the rto_loop_next. Then it atomically sets rto_loop_start with a cmpxchg. If the old value is not "0", then it is done, as another CPU is kicking off the IPI loop. If the old value is "0", then it will take the rto_lock to synchronize with a possible IPI being sent around to the overloaded CPUs. If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no IPI being sent around, or one is about to finish. Then rto_cpu is set to the first CPU in rto_mask and an IPI is sent to that CPU. If there's no CPUs set in rto_mask, then there's nothing to be done. When the CPU receives the IPI, it will first try to push any RT tasks that is queued on the CPU but can't run because a higher priority RT task is currently running on that CPU. Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it finds one, it simply sends an IPI to that CPU and the process continues. If there's no more CPUs in the rto_mask, then rto_loop is compared with rto_loop_next. If they match, everything is done and the process is over. If they do not match, then a CPU scheduled in a lower priority task as the IPI was being passed around, and the process needs to start again. The first CPU in rto_mask is sent the IPI. This change removes this duplication of work in the IPI logic, and greatly lowers the latency caused by the IPIs. This removed the lockup happening on the 120 CPU machine. It also simplifies the code tremendously. What else could anyone ask for? Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and supplying me with the rto_start_trylock() and rto_start_unlock() helper functions. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Peter Zijlstra (Intel) Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Scott Wood Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/core.c | 6 ++ kernel/sched/rt.c | 235 +++++++++++++++++++++++++-------------------------- kernel/sched/sched.h | 24 ++++-- 3 files changed, 138 insertions(+), 127 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a99c32946e49..15874a85ebcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5907,6 +5907,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_dlo_mask; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 78ae5c1d9412..faa75afcb7fe 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,10 +64,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -87,13 +83,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1802,160 +1791,166 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI + /* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tassk must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. + * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. */ static int rto_next_cpu(struct rq *rq) { - int prev_cpu = rq->rt.push_cpu; + struct root_domain *rd = rq->rd; + int next; int cpu; - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; + for (;;) { - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished. - */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} + rd->rto_cpu = cpu; -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; + if (cpu < nr_cpu_ids) + return cpu; - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); + rd->rto_cpu = -1; - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); + + if (rd->rto_loop == next) break; + + rd->rto_loop = next; } - return cpu; + return -1; +} + +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); } -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 +static inline void rto_start_unlock(atomic_t *v) +{ + atomic_set_release(v, 0); +} static void tell_cpu_to_push(struct rq *rq) { - int cpu; + int cpu = -1; - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. - */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); - } + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); - /* When here, there's no IPI going around */ - - rq->rt.push_cpu = rq->cpu; - cpu = find_next_push_cpu(rq); - if (cpu >= nr_cpu_ids) + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) return; - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + raw_spin_lock(&rq->rd->rto_lock); + + /* + * The rto_cpu is updated under the lock, if it has a valid cpu + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an ipi needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq); + + raw_spin_unlock(&rq->rd->rto_lock); - irq_work_queue_on(&rq->rt.push_work, cpu); + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } /* Called from hardirq context */ -static void try_to_push_tasks(void *arg) +void rto_push_irq_work_func(struct irq_work *work) { - struct rt_rq *rt_rq = arg; - struct rq *rq, *src_rq; - int this_cpu; + struct rq *rq; int cpu; - this_cpu = rt_rq->push_cpu; + rq = this_rq(); - /* Paranoid check */ - BUG_ON(this_cpu != smp_processor_id()); - - rq = cpu_rq(this_cpu); - src_rq = rq_of_rt_rq(rt_rq); - -again: + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. + */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_task(rq); + push_rt_tasks(rq); raw_spin_unlock(&rq->lock); } - /* Pass the IPI to the next rt overloaded queue */ - raw_spin_lock(&rt_rq->push_lock); - /* - * If the source queue changed since the IPI went out, - * we need to restart the search from that CPU again. - */ - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; - rt_rq->push_cpu = src_rq->cpu; - } + raw_spin_lock(&rq->rd->rto_lock); - cpu = find_next_push_cpu(src_rq); + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; - raw_spin_unlock(&rt_rq->push_lock); + raw_spin_unlock(&rq->rd->rto_lock); - if (cpu >= nr_cpu_ids) + if (cpu < 0) return; - /* - * It is possible that a restart caused this CPU to be - * chosen again. Don't bother with an IPI, just see if we - * have more to push. - */ - if (unlikely(cpu == rq->cpu)) - goto again; - /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55d92a1ca070..448a8266ceea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -429,7 +429,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -450,12 +450,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -537,6 +531,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -547,6 +554,9 @@ struct root_domain { extern struct root_domain def_root_domain; +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* -- cgit v1.2.3 From 91bd72dd8c72c603132bbbfd348a4216b8c83f58 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 29 Oct 2017 06:30:19 -0400 Subject: fscrypt: lock mutex before checking for bounce page pool commit a0b3bc855374c50b5ea85273553485af48caf2f7 upstream. fscrypt_initialize(), which allocates the global bounce page pool when an encrypted file is first accessed, uses "double-checked locking" to try to avoid locking fscrypt_init_mutex. However, it doesn't use any memory barriers, so it's theoretically possible for a thread to observe a bounce page pool which has not been fully initialized. This is a classic bug with "double-checked locking". While "only a theoretical issue" in the latest kernel, in pre-4.8 kernels the pointer that was checked was not even the last to be initialized, so it was easily possible for a crash (NULL pointer dereference) to happen. This was changed only incidentally by the large refactor to use fs/crypto/. Solve both problems in a trivial way that can easily be backported: just always take the mutex. It's theoretically less efficient, but it shouldn't be noticeable in practice as the mutex is only acquired very briefly once per encrypted file. Later I'd like to make this use a helper macro like DO_ONCE(). However, DO_ONCE() runs in atomic context, so we'd need to add a new macro that allows blocking. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/crypto_key.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 9a1bc638abce..9308fe4b66e6 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -129,11 +129,9 @@ int ext4_get_encryption_info(struct inode *inode) if (ei->i_crypt_info) return 0; - if (!ext4_read_workqueue) { - res = ext4_init_crypto(); - if (res) - return res; - } + res = ext4_init_crypto(); + if (res) + return res; res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, -- cgit v1.2.3 From fd3c395d4d466a8d76c6afd01c9a0e718daf6ca4 Mon Sep 17 00:00:00 2001 From: Tuomas Tynkkynen Date: Wed, 6 Sep 2017 17:59:08 +0300 Subject: net/9p: Switch to wait_event_killable() commit 9523feac272ccad2ad8186ba4fcc89103754de52 upstream. Because userspace gets Very Unhappy when calls like stat() and execve() return -EINTR on 9p filesystem mounts. For instance, when bash is looking in PATH for things to execute and some SIGCHLD interrupts stat(), bash can throw a spurious 'command not found' since it doesn't retry the stat(). In practice, hitting the problem is rare and needs a really slow/bogged down 9p server. Signed-off-by: Tuomas Tynkkynen Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- net/9p/client.c | 3 +-- net/9p/trans_virtio.c | 13 ++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index f5feac4ff4ec..3ff26eb1ea20 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -749,8 +749,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) } again: /* Wait for the response */ - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Make sure our req is coherent with regard to updates in other diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 6e70ddb158b4..2ddeecca5b12 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -290,8 +290,8 @@ req_retry: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) return err; @@ -331,7 +331,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, * Other zc request to finish here */ if (atomic_read(&vp_pinned) >= chan->p9_max_pages) { - err = wait_event_interruptible(vp_wq, + err = wait_event_killable(vp_wq, (atomic_read(&vp_pinned) < chan->p9_max_pages)); if (err == -ERESTARTSYS) return err; @@ -475,8 +475,8 @@ req_retry_pinned: if (err == -ENOSPC) { chan->ring_bufs_avail = 0; spin_unlock_irqrestore(&chan->lock, flags); - err = wait_event_interruptible(*chan->vc_wq, - chan->ring_bufs_avail); + err = wait_event_killable(*chan->vc_wq, + chan->ring_bufs_avail); if (err == -ERESTARTSYS) goto err_out; @@ -493,8 +493,7 @@ req_retry_pinned: virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); - err = wait_event_interruptible(*req->wq, - req->status >= REQ_STATUS_RCVD); + err = wait_event_killable(*req->wq, req->status >= REQ_STATUS_RCVD); /* * Non kernel buffers are pinned, unpin them */ -- cgit v1.2.3 From d6968bc56e4469324482b31fa3304fff0e196c46 Mon Sep 17 00:00:00 2001 From: Tobias Jordan Date: Wed, 4 Oct 2017 11:35:03 +0530 Subject: PM / OPP: Add missing of_node_put(np) commit 7978db344719dab1e56d05e6fc04aaaddcde0a5e upstream. The for_each_available_child_of_node() loop in _of_add_opp_table_v2() doesn't drop the reference to "np" on errors. Fix that. Fixes: 274659029c9d (PM / OPP: Add support to parse "operating-points-v2" bindings) Signed-off-by: Tobias Jordan [ VK: Improved commit log. ] Signed-off-by: Viresh Kumar Reviewed-by: Stephen Boyd Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/base/power/opp/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index f8580900c273..db6e7e57081c 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -1205,6 +1205,7 @@ static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np) if (ret) { dev_err(dev, "%s: Failed to add OPP, %d\n", __func__, ret); + of_node_put(np); goto free_table; } } -- cgit v1.2.3 From fbb2d8000a8f2a257f33ecb2d1fc346dcfe2e09f Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 21 Jul 2017 11:36:23 -0700 Subject: e1000e: Fix error path in link detection commit c4c40e51f9c32c6dd8adf606624c930a1c4d9bbb upstream. In case of error from e1e_rphy(), the loop will exit early and "success" will be set to true erroneously. Signed-off-by: Benjamin Poirier Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/e1000e/phy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index de13aeacae97..8e674a0988b0 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -1744,6 +1744,7 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations, s32 ret_val = 0; u16 i, phy_status; + *success = false; for (i = 0; i < iterations; i++) { /* Some PHYs require the MII_BMSR register to be read * twice due to the link bit being sticky. No harm doing @@ -1763,16 +1764,16 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations, ret_val = e1e_rphy(hw, MII_BMSR, &phy_status); if (ret_val) break; - if (phy_status & BMSR_LSTATUS) + if (phy_status & BMSR_LSTATUS) { + *success = true; break; + } if (usec_interval >= 1000) msleep(usec_interval / 1000); else udelay(usec_interval); } - *success = (i < iterations); - return ret_val; } -- cgit v1.2.3 From 5df4097cfc7049f64cc0896ede59e94087abaefe Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 21 Jul 2017 11:36:25 -0700 Subject: e1000e: Fix return value test commit d3509f8bc7b0560044c15f0e3ecfde1d9af757a6 upstream. All the helpers return -E1000_ERR_PHY. Signed-off-by: Benjamin Poirier Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 80ec587d510e..3ecbc00a633a 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5035,7 +5035,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) break; } - if ((ret_val == E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) && + if ((ret_val == -E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) && (er32(CTRL) & E1000_PHY_CTRL_GBE_DISABLE)) { /* See e1000_kmrn_lock_loss_workaround_ich8lan() */ e_info("Gigabit has been disabled, downgrading speed\n"); -- cgit v1.2.3 From bbb1fc744894bdba5a94b46b3e178cfb2330831d Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 21 Jul 2017 11:36:26 -0700 Subject: e1000e: Separate signaling for link check/link up commit 19110cfbb34d4af0cdfe14cd243f3b09dc95b013 upstream. Lennart reported the following race condition: \ e1000_watchdog_task \ e1000e_has_link \ hw->mac.ops.check_for_link() === e1000e_check_for_copper_link /* link is up */ mac->get_link_status = false; /* interrupt */ \ e1000_msix_other hw->mac.get_link_status = true; link_active = !hw->mac.get_link_status /* link_active is false, wrongly */ This problem arises because the single flag get_link_status is used to signal two different states: link status needs checking and link status is down. Avoid the problem by using the return value of .check_for_link to signal the link status to e1000e_has_link(). Reported-by: Lennart Sorensen Signed-off-by: Benjamin Poirier Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/e1000e/mac.c | 11 ++++++++--- drivers/net/ethernet/intel/e1000e/netdev.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c index e59d7c283cd4..645ace74429e 100644 --- a/drivers/net/ethernet/intel/e1000e/mac.c +++ b/drivers/net/ethernet/intel/e1000e/mac.c @@ -410,6 +410,9 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. + * + * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link + * up). **/ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) { @@ -423,7 +426,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. */ if (!mac->get_link_status) - return 0; + return 1; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -461,10 +464,12 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * different link partner. */ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) + if (ret_val) { e_dbg("Error configuring flow control\n"); + return ret_val; + } - return ret_val; + return 1; } /** diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 3ecbc00a633a..5205f1ebe381 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5017,7 +5017,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) case e1000_media_type_copper: if (hw->mac.get_link_status) { ret_val = hw->mac.ops.check_for_link(hw); - link_active = !hw->mac.get_link_status; + link_active = ret_val > 0; } else { link_active = true; } -- cgit v1.2.3 From 57d2ce1603101ce3f30d0ccdc35b98af08d2ed88 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Mon, 4 Jul 2016 17:04:37 -0700 Subject: RDS: RDMA: return appropriate error on rdma map failures [ Upstream commit 584a8279a44a800dea5a5c1e9d53a002e03016b4 ] The first message to a remote node should prompt a new connection even if it is RDMA operation. For RDMA operation the MR mapping can fail because connections is not yet up. Since the connection establishment is asynchronous, we make sure the map failure because of unavailable connection reach to the user by appropriate error code. Before returning to the user, lets trigger the connection so that its ready for the next retry. Signed-off-by: Santosh Shilimkar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/rds/send.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/rds/send.c b/net/rds/send.c index 6815f03324d7..1a3c6acdd3f8 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -959,6 +959,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; + else if (ret == -ENODEV) + /* Accommodate the get_mr() case which can fail + * if connection isn't established yet. + */ + ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: @@ -1072,8 +1077,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Parse any control messages the user may have included. */ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) + if (ret) { + /* Trigger connection so that its ready for the next retry */ + if (ret == -EAGAIN) + rds_conn_connect_if_down(conn); goto out; + } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", -- cgit v1.2.3 From 37a48e6d83f513c0edabe4f12a80cb9fc4b8286b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 2 Jan 2017 14:04:24 -0600 Subject: PCI: Apply _HPX settings only to relevant devices [ Upstream commit 977509f7c5c6fb992ffcdf4291051af343b91645 ] Previously we didn't check the type of device before trying to apply Type 1 (PCI-X) or Type 2 (PCIe) Setting Records from _HPX. We don't support PCI-X Setting Records, so this was harmless, but the warning was useless. We do support PCIe Setting Records, and we didn't check whether a device was PCIe before applying settings. I don't think anything bad happened on non-PCIe devices because pcie_capability_clear_and_set_word(), pcie_cap_has_lnkctl(), etc., would fail before doing any harm. But it's ugly to depend on those internals. Check the device type before attempting to apply Type 1 and Type 2 Setting Records (Type 0 records are applicable to PCI, PCI-X, and PCIe devices). A side benefit is that this prevents useless "not supported" warnings when a BIOS supplies a Type 1 (PCI-X) Setting Record and we try to apply it to every single device: pci 0000:00:00.0: PCI-X settings not supported After this patch, we'll get the warning only when a BIOS supplies a Type 1 record and we have a PCI-X device to which it should be applied. Link: https://bugzilla.kernel.org/show_bug.cgi?id=187731 Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/pci/probe.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index b83df942794f..193ac13de49b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1414,8 +1414,16 @@ static void program_hpp_type0(struct pci_dev *dev, struct hpp_type0 *hpp) static void program_hpp_type1(struct pci_dev *dev, struct hpp_type1 *hpp) { - if (hpp) - dev_warn(&dev->dev, "PCI-X settings not supported\n"); + int pos; + + if (!hpp) + return; + + pos = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (!pos) + return; + + dev_warn(&dev->dev, "PCI-X settings not supported\n"); } static bool pcie_root_rcb_set(struct pci_dev *dev) @@ -1441,6 +1449,9 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp) if (!hpp) return; + if (!pci_is_pcie(dev)) + return; + if (hpp->revision > 1) { dev_warn(&dev->dev, "PCIe settings rev %d not supported\n", hpp->revision); -- cgit v1.2.3 From 778395506a7f5744dce0cc14876c2f45ac779dea Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 15 Dec 2016 22:03:36 +0800 Subject: dmaengine: zx: set DMA_CYCLIC cap_mask bit [ Upstream commit fc318d64f3d91e15babac00e08354b1beb650b57 ] The zx_dma driver supports cyclic transfer mode. Let's set DMA_CYCLIC cap_mask bit to make that clear, and avoid unnecessary failure when clients request channel via dma_request_chan_by_mask() with DMA_CYCLIC bit set in mask. Signed-off-by: Shawn Guo Reviewed-by: Jun Nie Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/dma/zx296702_dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/zx296702_dma.c b/drivers/dma/zx296702_dma.c index 245d759d5ffc..6059d81e701a 100644 --- a/drivers/dma/zx296702_dma.c +++ b/drivers/dma/zx296702_dma.c @@ -813,6 +813,7 @@ static int zx_dma_probe(struct platform_device *op) INIT_LIST_HEAD(&d->slave.channels); dma_cap_set(DMA_SLAVE, d->slave.cap_mask); dma_cap_set(DMA_MEMCPY, d->slave.cap_mask); + dma_cap_set(DMA_CYCLIC, d->slave.cap_mask); dma_cap_set(DMA_PRIVATE, d->slave.cap_mask); d->slave.dev = &op->dev; d->slave.device_free_chan_resources = zx_dma_free_chan_resources; -- cgit v1.2.3 From d7c3d5333e39a3089ad259222b767a965963a693 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 29 Dec 2016 15:39:37 -0800 Subject: net: Allow IP_MULTICAST_IF to set index to L3 slave [ Upstream commit 7bb387c5ab12aeac3d5eea28686489ff46b53ca9 ] IP_MULTICAST_IF fails if sk_bound_dev_if is already set and the new index does not match it. e.g., ntpd[15381]: setsockopt IP_MULTICAST_IF 192.168.1.23 fails: Invalid argument Relax the check in setsockopt to allow setting mc_index to an L3 slave if sk_bound_dev_if points to an L3 master. Make a similar change for IPv6. In this case change the device lookup to take the rcu_read_lock avoiding a refcnt. The rcu lock is also needed for the lookup of a potential L3 master device. This really only silences a setsockopt failure since uses of mc_index are secondary to sk_bound_dev_if if it is set. In both cases, if either index is an L3 slave or master, lookups are directed to the same FIB table so relaxing the check at setsockopt time causes no harm. Patch is based on a suggested change by Darwin for a problem noted in their code base. Suggested-by: Darwin Dingel Signed-off-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_sockglue.c | 7 ++++++- net/ipv6/ipv6_sockglue.c | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f300d1cbfa91..097a1243c16c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -808,6 +808,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct ip_mreqn mreq; struct net_device *dev = NULL; + int midx; if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -852,11 +853,15 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EADDRNOTAVAIL; if (!dev) break; + + midx = l3mdev_master_ifindex(dev); + dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && - mreq.imr_ifindex != sk->sk_bound_dev_if) + mreq.imr_ifindex != sk->sk_bound_dev_if && + (!midx || midx != sk->sk_bound_dev_if)) break; inet->mc_index = mreq.imr_ifindex; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4449ad1f8114..a4a30d2ca66f 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -583,16 +583,24 @@ done: if (val) { struct net_device *dev; + int midx; - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) - goto e_inval; + rcu_read_lock(); - dev = dev_get_by_index(net, val); + dev = dev_get_by_index_rcu(net, val); if (!dev) { + rcu_read_unlock(); retv = -ENODEV; break; } - dev_put(dev); + midx = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != val && + (!midx || midx != sk->sk_bound_dev_if)) + goto e_inval; } np->mcast_oif = val; retv = 0; -- cgit v1.2.3 From 609797646f9b1fbfa6817aec31ca1d0ea64efa19 Mon Sep 17 00:00:00 2001 From: Thomas Preisner Date: Fri, 30 Dec 2016 03:37:54 +0100 Subject: net: 3com: typhoon: typhoon_init_one: make return values more specific [ Upstream commit 6b6bbb5922a4b1d4b58125a572da91010295fba3 ] In some cases the return value of a failing function is not being used and the function typhoon_init_one() returns another negative error code instead. Signed-off-by: Thomas Preisner Signed-off-by: Milan Stephan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/3com/typhoon.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 8f8418d2ac4a..6d0f0eb2ee2f 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2366,9 +2366,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * 4) Get the hardware address. * 5) Put the card to sleep. */ - if (typhoon_reset(ioaddr, WaitSleep) < 0) { + err = typhoon_reset(ioaddr, WaitSleep); + if (err < 0) { err_msg = "could not reset 3XP"; - err = -EIO; goto error_out_dma; } @@ -2382,16 +2382,16 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) typhoon_init_interface(tp); typhoon_init_rings(tp); - if(typhoon_boot_3XP(tp, TYPHOON_STATUS_WAITING_FOR_HOST) < 0) { + err = typhoon_boot_3XP(tp, TYPHOON_STATUS_WAITING_FOR_HOST); + if (err < 0) { err_msg = "cannot boot 3XP sleep image"; - err = -EIO; goto error_out_reset; } INIT_COMMAND_WITH_RESPONSE(&xp_cmd, TYPHOON_CMD_READ_MAC_ADDRESS); - if(typhoon_issue_command(tp, 1, &xp_cmd, 1, xp_resp) < 0) { + err = typhoon_issue_command(tp, 1, &xp_cmd, 1, xp_resp); + if (err < 0) { err_msg = "cannot read MAC address"; - err = -EIO; goto error_out_reset; } @@ -2424,9 +2424,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if(xp_resp[0].numDesc != 0) tp->capabilities |= TYPHOON_WAKEUP_NEEDS_RESET; - if(typhoon_sleep(tp, PCI_D3hot, 0) < 0) { + err = typhoon_sleep(tp, PCI_D3hot, 0); + if (err < 0) { err_msg = "cannot put adapter to sleep"; - err = -EIO; goto error_out_reset; } -- cgit v1.2.3 From 7f69dc100400482425c108a0dc0102101e192a21 Mon Sep 17 00:00:00 2001 From: Thomas Preisner Date: Fri, 30 Dec 2016 03:37:53 +0100 Subject: net: 3com: typhoon: typhoon_init_one: fix incorrect return values [ Upstream commit 107fded7bf616ad6f46823d98b8ed6405d7adf2d ] In a few cases the err-variable is not set to a negative error code if a function call in typhoon_init_one() fails and thus 0 is returned instead. It may be better to set err to the appropriate negative error code before returning. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188841 Reported-by: Pan Bian Signed-off-by: Thomas Preisner Signed-off-by: Milan Stephan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/3com/typhoon.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 6d0f0eb2ee2f..a0012c3cb4f6 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2398,8 +2398,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) *(__be16 *)&dev->dev_addr[0] = htons(le16_to_cpu(xp_resp[0].parm1)); *(__be32 *)&dev->dev_addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2)); - if(!is_valid_ether_addr(dev->dev_addr)) { + if (!is_valid_ether_addr(dev->dev_addr)) { err_msg = "Could not obtain valid ethernet address, aborting"; + err = -EIO; goto error_out_reset; } @@ -2407,7 +2408,8 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * later when we print out the version reported. */ INIT_COMMAND_WITH_RESPONSE(&xp_cmd, TYPHOON_CMD_READ_VERSIONS); - if(typhoon_issue_command(tp, 1, &xp_cmd, 3, xp_resp) < 0) { + err = typhoon_issue_command(tp, 1, &xp_cmd, 3, xp_resp); + if (err < 0) { err_msg = "Could not get Sleep Image version"; goto error_out_reset; } @@ -2449,7 +2451,8 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->features = dev->hw_features | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_RXCSUM; - if(register_netdev(dev) < 0) { + err = register_netdev(dev); + if (err < 0) { err_msg = "unable to register netdev"; goto error_out_reset; } -- cgit v1.2.3 From c4cf731af838706523c3746fafe49afaa323887c Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 30 Dec 2016 17:38:52 +0100 Subject: drm/armada: Fix compile fail [ Upstream commit 7357f89954b6d005df6ab8929759e78d7d9a80f9 ] I reported the include issue for tracepoints a while ago, but nothing seems to have happened. Now it bit us, since the drm_mm_print conversion was broken for armada. Fix it, so I can re-enable armada in the drm-misc build configs. v2: Rebase just the compile fix on top of Chris' build fix. Cc: Russell King Cc: Chris Wilson Acked: Chris Wilson Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1483115932-19584-1-git-send-email-daniel.vetter@ffwll.ch Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/armada/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/armada/Makefile b/drivers/gpu/drm/armada/Makefile index ffd673615772..26412d2f8c98 100644 --- a/drivers/gpu/drm/armada/Makefile +++ b/drivers/gpu/drm/armada/Makefile @@ -4,3 +4,5 @@ armada-y += armada_510.o armada-$(CONFIG_DEBUG_FS) += armada_debugfs.o obj-$(CONFIG_DRM_ARMADA) := armada.o + +CFLAGS_armada_trace.o := -I$(src) -- cgit v1.2.3 From 64b22ee723eed2e35265b9431db0e7587c53141f Mon Sep 17 00:00:00 2001 From: Ryan Hsu Date: Tue, 13 Dec 2016 14:55:19 -0800 Subject: ath10k: fix incorrect txpower set by P2P_DEVICE interface [ Upstream commit 88407beb1b1462f706a1950a355fd086e1c450b6 ] Ath10k reports the phy capability that supports P2P_DEVICE interface. When we use the P2P supported wpa_supplicant to start connection, it'll create two interfaces, one is wlan0 (vdev_id=0) and one is P2P_DEVICE p2p-dev-wlan0 which is for p2p control channel (vdev_id=1). ath10k_pci mac vdev create 0 (add interface) type 2 subtype 0 ath10k_add_interface: vdev_id: 0, txpower: 0, bss_power: 0 ... ath10k_pci mac vdev create 1 (add interface) type 2 subtype 1 ath10k_add_interface: vdev_id: 1, txpower: 0, bss_power: 0 And the txpower in per vif bss_conf will only be set to valid tx power when the interface is assigned with channel_ctx. But this P2P_DEVICE interface will never be used for any connection, so that the uninitialized bss_conf.txpower=0 is assinged to the arvif->txpower when interface created. Since the txpower configuration is firmware per physical interface. So the smallest txpower of all vifs will be the one limit the tx power of the physical device, that causing the low txpower issue on other active interfaces. wlan0: Limiting TX power to 21 (24 - 3) dBm ath10k_pci mac vdev_id 0 txpower 21 ath10k_mac_txpower_recalc: vdev_id: 1, txpower: 0 ath10k_mac_txpower_recalc: vdev_id: 0, txpower: 21 ath10k_pci mac txpower 0 This issue only happens when we use the wpa_supplicant that supports P2P or if we use the iw tool to create the control P2P_DEVICE interface. Signed-off-by: Ryan Hsu Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/mac.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 6decf4a95ce1..cf3baf3185e8 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4180,7 +4180,8 @@ static int ath10k_mac_txpower_recalc(struct ath10k *ar) lockdep_assert_held(&ar->conf_mutex); list_for_each_entry(arvif, &ar->arvifs, list) { - WARN_ON(arvif->txpower < 0); + if (arvif->txpower <= 0) + continue; if (txpower == -1) txpower = arvif->txpower; @@ -4188,8 +4189,8 @@ static int ath10k_mac_txpower_recalc(struct ath10k *ar) txpower = min(txpower, arvif->txpower); } - if (WARN_ON(txpower == -1)) - return -EINVAL; + if (txpower == -1) + return 0; ret = ath10k_mac_txpower_setup(ar, txpower); if (ret) { -- cgit v1.2.3 From b24769300a00453fb4d12a319c70516fe030354e Mon Sep 17 00:00:00 2001 From: Ryan Hsu Date: Thu, 22 Dec 2016 15:02:37 -0800 Subject: ath10k: ignore configuring the incorrect board_id [ Upstream commit d2e202c06ca42d353d95df12437740921a6d05b5 ] With command to get board_id from otp, in the case of following boot get otp board id result 0x00000000 board_id 0 chip_id 0 boot using board name 'bus=pci,bmi-chip-id=0,bmi-board-id=0" ... failed to fetch board data for bus=pci,bmi-chip-id=0,bmi-board-id=0 from ath10k/QCA6174/hw3.0/board-2.bin The invalid board_id=0 will be used as index to search in the board-2.bin. Ignore the case with board_id=0, as it means the otp is not carrying the board id information. Signed-off-by: Ryan Hsu Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 05de75360fa4..ee638cb8b48f 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -548,8 +548,11 @@ static int ath10k_core_get_board_id_from_otp(struct ath10k *ar) "boot get otp board id result 0x%08x board_id %d chip_id %d\n", result, board_id, chip_id); - if ((result & ATH10K_BMI_BOARD_ID_STATUS_MASK) != 0) + if ((result & ATH10K_BMI_BOARD_ID_STATUS_MASK) != 0 || + (board_id == 0)) { + ath10k_warn(ar, "board id is not exist in otp, ignore it\n"); return -EOPNOTSUPP; + } ar->id.bmi_ids_valid = true; ar->id.bmi_board_id = board_id; -- cgit v1.2.3 From ef751ca54602d380873b33580eb1f1da19c6b592 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 29 Dec 2016 16:12:09 +0200 Subject: ath10k: fix potential memory leak in ath10k_wmi_tlv_op_pull_fw_stats() [ Upstream commit 097e46d2ae90265d1afe141ba6208ba598b79e01 ] ath10k_wmi_tlv_op_pull_fw_stats() uses tb = ath10k_wmi_tlv_parse_alloc(...) function, which allocates memory. If any of the three error-paths are taken, this tb needs to be freed. Signed-off-by: Christian Lamparter Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/wmi-tlv.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/wmi-tlv.c b/drivers/net/wireless/ath/ath10k/wmi-tlv.c index 6fbd17b69469..02eea3c3b5d3 100644 --- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c +++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c @@ -1105,8 +1105,10 @@ static int ath10k_wmi_tlv_op_pull_fw_stats(struct ath10k *ar, struct ath10k_fw_stats_pdev *dst; src = data; - if (data_len < sizeof(*src)) + if (data_len < sizeof(*src)) { + kfree(tb); return -EPROTO; + } data += sizeof(*src); data_len -= sizeof(*src); @@ -1126,8 +1128,10 @@ static int ath10k_wmi_tlv_op_pull_fw_stats(struct ath10k *ar, struct ath10k_fw_stats_vdev *dst; src = data; - if (data_len < sizeof(*src)) + if (data_len < sizeof(*src)) { + kfree(tb); return -EPROTO; + } data += sizeof(*src); data_len -= sizeof(*src); @@ -1145,8 +1149,10 @@ static int ath10k_wmi_tlv_op_pull_fw_stats(struct ath10k *ar, struct ath10k_fw_stats_peer *dst; src = data; - if (data_len < sizeof(*src)) + if (data_len < sizeof(*src)) { + kfree(tb); return -EPROTO; + } data += sizeof(*src); data_len -= sizeof(*src); -- cgit v1.2.3 From 7e920566c352c9434010cbaf6a5ccd82c3ce73cf Mon Sep 17 00:00:00 2001 From: Bartosz Markowski Date: Thu, 15 Dec 2016 11:23:24 +0200 Subject: ath10k: set CTS protection VDEV param only if VDEV is up [ Upstream commit 7cfe0455ee1218add152e986b89b4bb8dbeafcdd ] The cts protection vdev parameter, in new QCA9377 TF2.0 firmware, requires bss peer to be created for the STATION vdev type. bss peer is being allocated by the firmware after vdev_start/_up commands. mac80211 may call the cts protection setup at any time, so the we needs to track the situation and defer the cts configuration to prevent firmware asserts, like below: [00]: 0x05020001 0x000015B3 0x0099ACE2 0x00955B31 [04]: 0x0099ACE2 0x00060730 0x00000004 0x00000000 [08]: 0x0044C754 0x00412C10 0x00000000 0x00409C54 [12]: 0x00000009 0x00000000 0x00952F6C 0x00952F77 [16]: 0x00952CC4 0x00910712 0x00000000 0x00000000 [20]: 0x4099ACE2 0x0040E858 0x00421254 0x004127F4 [24]: 0x8099B9B2 0x0040E8B8 0x00000000 0xC099ACE2 [28]: 0x800B75CB 0x0040E8F8 0x00000007 0x00005008 [32]: 0x809B048A 0x0040E958 0x00000010 0x00433B10 [36]: 0x809AFBBC 0x0040E9A8 0x0042BB74 0x0042BBBC [40]: 0x8091D252 0x0040E9C8 0x0042BBBC 0x00000001 [44]: 0x809FFA45 0x0040EA78 0x0043D3E4 0x0042C2C8 [48]: 0x809FCEF4 0x0040EA98 0x0043D3E4 0x00000001 [52]: 0x80911210 0x0040EAE8 0x00000010 0x004041D0 [56]: 0x80911154 0x0040EB28 0x00400000 0x00000000 Signed-off-by: Bartosz Markowski Signed-off-by: Kalle Valo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/mac.c | 51 +++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index cf3baf3185e8..bed8d89fe3a0 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -1127,6 +1127,36 @@ static int ath10k_monitor_recalc(struct ath10k *ar) return ath10k_monitor_stop(ar); } +static bool ath10k_mac_can_set_cts_prot(struct ath10k_vif *arvif) +{ + struct ath10k *ar = arvif->ar; + + lockdep_assert_held(&ar->conf_mutex); + + if (!arvif->is_started) { + ath10k_dbg(ar, ATH10K_DBG_MAC, "defer cts setup, vdev is not ready yet\n"); + return false; + } + + return true; +} + +static int ath10k_mac_set_cts_prot(struct ath10k_vif *arvif) +{ + struct ath10k *ar = arvif->ar; + u32 vdev_param; + + lockdep_assert_held(&ar->conf_mutex); + + vdev_param = ar->wmi.vdev_param->protection_mode; + + ath10k_dbg(ar, ATH10K_DBG_MAC, "mac vdev %d cts_protection %d\n", + arvif->vdev_id, arvif->use_cts_prot); + + return ath10k_wmi_vdev_set_param(ar, arvif->vdev_id, vdev_param, + arvif->use_cts_prot ? 1 : 0); +} + static int ath10k_recalc_rtscts_prot(struct ath10k_vif *arvif) { struct ath10k *ar = arvif->ar; @@ -4788,20 +4818,18 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw, if (changed & BSS_CHANGED_ERP_CTS_PROT) { arvif->use_cts_prot = info->use_cts_prot; - ath10k_dbg(ar, ATH10K_DBG_MAC, "mac vdev %d cts_prot %d\n", - arvif->vdev_id, info->use_cts_prot); ret = ath10k_recalc_rtscts_prot(arvif); if (ret) ath10k_warn(ar, "failed to recalculate rts/cts prot for vdev %d: %d\n", arvif->vdev_id, ret); - vdev_param = ar->wmi.vdev_param->protection_mode; - ret = ath10k_wmi_vdev_set_param(ar, arvif->vdev_id, vdev_param, - info->use_cts_prot ? 1 : 0); - if (ret) - ath10k_warn(ar, "failed to set protection mode %d on vdev %i: %d\n", - info->use_cts_prot, arvif->vdev_id, ret); + if (ath10k_mac_can_set_cts_prot(arvif)) { + ret = ath10k_mac_set_cts_prot(arvif); + if (ret) + ath10k_warn(ar, "failed to set cts protection for vdev %d: %d\n", + arvif->vdev_id, ret); + } } if (changed & BSS_CHANGED_ERP_SLOT) { @@ -6713,6 +6741,13 @@ ath10k_mac_op_assign_vif_chanctx(struct ieee80211_hw *hw, arvif->is_up = true; } + if (ath10k_mac_can_set_cts_prot(arvif)) { + ret = ath10k_mac_set_cts_prot(arvif); + if (ret) + ath10k_warn(ar, "failed to set cts protection for vdev %d: %d\n", + arvif->vdev_id, ret); + } + mutex_unlock(&ar->conf_mutex); return 0; -- cgit v1.2.3 From 31447ebb1a863b611676e2787db7160e19bd7e3c Mon Sep 17 00:00:00 2001 From: Gabriele Mazzotta Date: Sat, 24 Dec 2016 19:50:00 +0100 Subject: ALSA: hda - Apply ALC269_FIXUP_NO_SHUTUP on HDA_FIXUP_ACT_PROBE [ Upstream commit 972aa2c708703c21f14eb958b37e82aae2530e44 ] Setting shutup when the action is HDA_FIXUP_ACT_PRE_PROBE might not have the desired effect since it could be overridden by another more generic shutup function. Prevent this by setting the more specific shutup function on HDA_FIXUP_ACT_PROBE. Signed-off-by: Gabriele Mazzotta Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 64085a512dbf..e5730a7d0480 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4404,7 +4404,7 @@ static void alc_no_shutup(struct hda_codec *codec) static void alc_fixup_no_shutup(struct hda_codec *codec, const struct hda_fixup *fix, int action) { - if (action == HDA_FIXUP_ACT_PRE_PROBE) { + if (action == HDA_FIXUP_ACT_PROBE) { struct alc_spec *spec = codec->spec; spec->shutup = alc_no_shutup; } -- cgit v1.2.3 From f0d8fb74479b25ad56d25530ea5efc2881b02ad9 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 22 Dec 2016 08:36:38 +0000 Subject: drm: Apply range restriction after color adjustment when allocation [ Upstream commit 3db93756b501e5f0a3951c79cfa9ed43c26d3455 ] mm->color_adjust() compares the hole with its neighbouring nodes. They only abutt before we restrict the hole, so we have to apply color_adjust before we apply the range restriction. Signed-off-by: Chris Wilson Reviewed-by: Joonas Lahtinen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20161222083641.2691-36-chris@chris-wilson.co.uk Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_mm.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 04de6fd88f8c..521b5bddb99c 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -262,14 +262,12 @@ static void drm_mm_insert_helper_range(struct drm_mm_node *hole_node, BUG_ON(!hole_node->hole_follows || node->allocated); - if (adj_start < start) - adj_start = start; - if (adj_end > end) - adj_end = end; - if (mm->color_adjust) mm->color_adjust(hole_node, color, &adj_start, &adj_end); + adj_start = max(adj_start, start); + adj_end = min(adj_end, end); + if (flags & DRM_MM_CREATE_TOP) adj_start = adj_end - size; @@ -475,17 +473,15 @@ static struct drm_mm_node *drm_mm_search_free_in_range_generic(const struct drm_ flags & DRM_MM_SEARCH_BELOW) { u64 hole_size = adj_end - adj_start; - if (adj_start < start) - adj_start = start; - if (adj_end > end) - adj_end = end; - if (mm->color_adjust) { mm->color_adjust(entry, color, &adj_start, &adj_end); if (adj_end <= adj_start) continue; } + adj_start = max(adj_start, start); + adj_end = min(adj_end, end); + if (!check_free_hole(adj_start, adj_end, size, alignment)) continue; -- cgit v1.2.3 From 9825826ed7bd7ab22e835b88508f00057b2848c0 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Thu, 8 Dec 2016 10:15:50 +0900 Subject: mac80211: Remove invalid flag operations in mesh TSF synchronization [ Upstream commit 76f43b4c0a9337af22827d78de4f2b8fd5328489 ] mesh_sync_offset_adjust_tbtt() implements Extensible synchronization framework ([1] 13.13.2 Extensible synchronization framework). It shall not operate the flag "TBTT Adjusting subfield" ([1] 8.4.2.100.8 Mesh Capability), since it is used only for MBCA ([1] 13.13.4 Mesh beacon collision avoidance, see 13.13.4.4.3 TBTT scanning and adjustment procedures for detail). So this patch remove the flag operations. [1] IEEE Std 802.11 2012 Signed-off-by: Masashi Honma [remove adjusting_tbtt entirely, since it's now unused] Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mac80211/ieee80211_i.h | 1 - net/mac80211/mesh.c | 3 --- net/mac80211/mesh_sync.c | 11 ----------- 3 files changed, 15 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6837a46ca4a2..7b271f3ded6b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -682,7 +682,6 @@ struct ieee80211_if_mesh { const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; - bool adjusting_tbtt; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 9063e8e736ad..9e1ded80a992 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -295,8 +295,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ *pos |= ifmsh->ps_peers_deep_sleep ? IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; - *pos++ |= ifmsh->adjusting_tbtt ? - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00; *pos++ = 0x00; return 0; @@ -866,7 +864,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ifmsh->mesh_cc_id = 0; /* Disabled */ /* register sync ops from extensible synchronization framework */ ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id); - ifmsh->adjusting_tbtt = false; ifmsh->sync_offset_clockdrift_max = 0; set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); ieee80211_mesh_root_setup(ifmsh); diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index 64bc22ad9496..16ed43fe4841 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -119,7 +119,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, */ if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { - clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", sta->sta.addr); goto no_sync; @@ -168,11 +167,9 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u8 cap; WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); WARN_ON(!rcu_read_lock_held()); - cap = beacon->meshconf->meshconf_cap; spin_lock_bh(&ifmsh->sync_offset_lock); @@ -186,21 +183,13 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", ifmsh->sync_offset_clockdrift_max); set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); - - ifmsh->adjusting_tbtt = true; } else { msync_dbg(sdata, "TBTT : max clockdrift=%lld; too small to adjust\n", (long long)ifmsh->sync_offset_clockdrift_max); ifmsh->sync_offset_clockdrift_max = 0; - - ifmsh->adjusting_tbtt = false; } spin_unlock_bh(&ifmsh->sync_offset_lock); - - beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ? - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap : - ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap; } static const struct sync_method sync_methods[] = { -- cgit v1.2.3 From 7e927748d4db4d248e919e44f4dd4891ba6b4d42 Mon Sep 17 00:00:00 2001 From: Masashi Honma Date: Wed, 30 Nov 2016 09:06:04 +0900 Subject: mac80211: Suppress NEW_PEER_CANDIDATE event if no room [ Upstream commit 11197d006bcfabf0173a7820a163fcaac420d10e ] Previously, kernel sends NEW_PEER_CANDIDATE event to user land even if the found peer does not have any room to accept other peer. This causes continuous connection trials. Signed-off-by: Masashi Honma Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mac80211/mesh_plink.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index bd3d55eb21d4..9f02e54ad2a5 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -495,12 +495,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, /* Userspace handles station allocation */ if (sdata->u.mesh.user_mpm || - sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) - cfg80211_notify_new_peer_candidate(sdata->dev, addr, - elems->ie_start, - elems->total_len, - GFP_KERNEL); - else + sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { + if (mesh_peer_accepts_plinks(elems) && + mesh_plink_availables(sdata)) + cfg80211_notify_new_peer_candidate(sdata->dev, addr, + elems->ie_start, + elems->total_len, + GFP_KERNEL); + } else sta = __mesh_sta_info_alloc(sdata, addr); return sta; -- cgit v1.2.3 From 266913b390c38f101971dfc6c1a5084e37fd41a7 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sat, 3 Dec 2016 17:24:17 +0800 Subject: iio: light: fix improper return value [ Upstream commit db4e5376d058af8924fafd0520a0942d92538d0e ] In function cm3232_reg_init(), it returns 0 even if the last call to i2c_smbus_write_byte_data() returns a negative value (indicates error). As a result, the return value may be inconsistent with the execution status, and the caller of cm3232_reg_init() will not be able to detect the error. This patch fixes the bug. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188641 Signed-off-by: Pan Bian Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iio/light/cm3232.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/light/cm3232.c b/drivers/iio/light/cm3232.c index fe89b6823217..263e97235ea0 100644 --- a/drivers/iio/light/cm3232.c +++ b/drivers/iio/light/cm3232.c @@ -119,7 +119,7 @@ static int cm3232_reg_init(struct cm3232_chip *chip) if (ret < 0) dev_err(&chip->client->dev, "Error writing reg_cmd\n"); - return 0; + return ret; } /** -- cgit v1.2.3 From 8537e96e4351dc80efe1b713c058da488b3afd29 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sat, 3 Dec 2016 21:44:30 +0800 Subject: staging: iio: cdc: fix improper return value [ Upstream commit 91ca1a8c584f55857b1f6ab20a1d3a1ce7a559bb ] At the end of function ad7150_write_event_config(), directly returns 0. As a result, the errors will be ignored by the callers. It may be better to return variable "ret". Signed-off-by: Pan Bian Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/cdc/ad7150.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/cdc/ad7150.c b/drivers/staging/iio/cdc/ad7150.c index e8d0ff2d5c9b..808d6ebf6c94 100644 --- a/drivers/staging/iio/cdc/ad7150.c +++ b/drivers/staging/iio/cdc/ad7150.c @@ -272,7 +272,7 @@ static int ad7150_write_event_config(struct iio_dev *indio_dev, error_ret: mutex_unlock(&chip->state_lock); - return 0; + return ret; } static int ad7150_read_event_value(struct iio_dev *indio_dev, -- cgit v1.2.3 From 85f286d6f8cddb199d40fbf6a0dfba2eb4f94bd5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 14 Dec 2016 13:28:05 +0100 Subject: spi: SPI_FSL_DSPI should depend on HAS_DMA [ Upstream commit dadab2d4e3cf708ceba22ecddd94aedfecb39199 ] If NO_DMA=y: ERROR: "bad_dma_ops" [drivers/spi/spi-fsl-dspi.ko] undefined! Add a dependency on HAS_DMA to fix this. Signed-off-by: Geert Uytterhoeven Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/spi/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 8b9c2a38d1cc..b0a24dedd1ed 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -315,6 +315,7 @@ config SPI_FSL_SPI config SPI_FSL_DSPI tristate "Freescale DSPI controller" select REGMAP_MMIO + depends on HAS_DMA depends on SOC_VF610 || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST help This enables support for the Freescale DSPI controller in master -- cgit v1.2.3 From a1e4f6a151035a549d1192c03ff82c3d81da18ce Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 11 Dec 2016 20:46:51 +0100 Subject: netfilter: nft_queue: use raw_smp_processor_id() [ Upstream commit c2e756ff9e699865d294cdc112acfc36419cf5cc ] Using smp_processor_id() causes splats with PREEMPT_RCU: [19379.552780] BUG: using smp_processor_id() in preemptible [00000000] code: ping/32389 [19379.552793] caller is debug_smp_processor_id+0x17/0x19 [...] [19379.552823] Call Trace: [19379.552832] [] dump_stack+0x67/0x90 [19379.552837] [] check_preemption_disabled+0xe5/0xf5 [19379.552842] [] debug_smp_processor_id+0x17/0x19 [19379.552849] [] nft_queue_eval+0x35/0x20c [nft_queue] No need to disable preemption since we only fetch the numeric value, so let's use raw_smp_processor_id() instead. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nft_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 61d216eb7917..5d189c11d208 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -37,7 +37,7 @@ static void nft_queue_eval(const struct nft_expr *expr, if (priv->queues_total > 1) { if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) { - int cpu = smp_processor_id(); + int cpu = raw_smp_processor_id(); queue = priv->queuenum + cpu % priv->queues_total; } else { -- cgit v1.2.3 From 47b99a3306d08c03467b0efe13fc1bc1721afa15 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 13 Dec 2016 13:59:33 +0100 Subject: netfilter: nf_tables: fix oob access [ Upstream commit 3e38df136e453aa69eb4472108ebce2fb00b1ba6 ] BUG: KASAN: slab-out-of-bounds in nf_tables_rule_destroy+0xf1/0x130 at addr ffff88006a4c35c8 Read of size 8 by task nft/1607 When we've destroyed last valid expr, nft_expr_next() returns an invalid expr. We must not dereference it unless it passes != nft_expr_last() check. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2cb429d34c03..120e9ae04db3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1996,7 +1996,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (expr != nft_expr_last(rule) && expr->ops) { nf_tables_expr_destroy(ctx, expr); expr = nft_expr_next(expr); } -- cgit v1.2.3 From e88f3fb0e34059c6d697802dc385c756129c17a1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 8 Dec 2016 13:05:43 +0000 Subject: ASoC: rsnd: don't double free kctrl [ Upstream commit 0ea617a298dcdc2251b4e10f83ac3f3e627b66e3 ] On an error, snd_ctl_add already free's kctrl, so calling snd_ctl_free_one to free it again leads to a double free error. Fix this by removing the extraneous snd_ctl_free_one call. Issue found using static analysis with CoverityScan, CID 1372908 Signed-off-by: Colin Ian King Acked-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index 362446c36c9e..e00dfbec22c5 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -1049,10 +1049,8 @@ static int __rsnd_kctrl_new(struct rsnd_mod *mod, return -ENOMEM; ret = snd_ctl_add(card, kctrl); - if (ret < 0) { - snd_ctl_free_one(kctrl); + if (ret < 0) return ret; - } cfg->update = update; cfg->card = card; -- cgit v1.2.3 From b63209c78f9ec90b1cb43e458918b89f53e0db1b Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Sun, 4 Dec 2016 12:51:53 +0800 Subject: btrfs: return the actual error value from from btrfs_uuid_tree_iterate [ Upstream commit 73ba39ab9307340dc98ec3622891314bbc09cc2e ] In function btrfs_uuid_tree_iterate(), errno is assigned to variable ret on errors. However, it directly returns 0. It may be better to return ret. This patch also removes the warning, because the caller already prints a warning. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188731 Signed-off-by: Pan Bian Reviewed-by: Omar Sandoval [ edited subject ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/uuid-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 778282944530..837a9a8d579e 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -348,7 +348,5 @@ skip: out: btrfs_free_path(path); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); - return 0; + return ret; } -- cgit v1.2.3 From 681b2239862dd88abf28b73e17e4f48af443f72e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 20 Dec 2016 10:29:12 +0000 Subject: ASoC: wm_adsp: Don't overrun firmware file buffer when reading region data [ Upstream commit 1cab2a84f470e15ecc8e5143bfe9398c6e888032 ] Protect against corrupt firmware files by ensuring that the length we get for the data in a region actually lies within the available firmware file data buffer. Signed-off-by: Richard Fitzgerald Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/codecs/wm_adsp.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 0bb415a28723..f1f990b325ad 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -1060,7 +1060,7 @@ static int wm_adsp_load(struct wm_adsp *dsp) const struct wmfw_region *region; const struct wm_adsp_region *mem; const char *region_name; - char *file, *text; + char *file, *text = NULL; struct wm_adsp_buf *buf; unsigned int reg; int regions = 0; @@ -1221,10 +1221,21 @@ static int wm_adsp_load(struct wm_adsp *dsp) regions, le32_to_cpu(region->len), offset, region_name); + if ((pos + le32_to_cpu(region->len) + sizeof(*region)) > + firmware->size) { + adsp_err(dsp, + "%s.%d: %s region len %d bytes exceeds file length %zu\n", + file, regions, region_name, + le32_to_cpu(region->len), firmware->size); + ret = -EINVAL; + goto out_fw; + } + if (text) { memcpy(text, region->data, le32_to_cpu(region->len)); adsp_info(dsp, "%s: %s\n", file, text); kfree(text); + text = NULL; } if (reg) { @@ -1269,6 +1280,7 @@ out_fw: regmap_async_complete(regmap); wm_adsp_buf_free(&buf_list); release_firmware(firmware); + kfree(text); out: kfree(file); @@ -1730,6 +1742,17 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp) } if (reg) { + if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) > + firmware->size) { + adsp_err(dsp, + "%s.%d: %s region len %d bytes exceeds file length %zu\n", + file, blocks, region_name, + le32_to_cpu(blk->len), + firmware->size); + ret = -EINVAL; + goto out_fw; + } + buf = wm_adsp_buf_alloc(blk->data, le32_to_cpu(blk->len), &buf_list); -- cgit v1.2.3 From eedd29f51078395ce8db909cf1130bd468d70ec8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 20 Dec 2016 12:58:10 +0100 Subject: s390/kbuild: enable modversions for symbols exported from asm [ Upstream commit cabab3f9f5ca077535080b3252e6168935b914af ] s390 version of commit 334bb7738764 ("x86/kbuild: enable modversions for symbols exported from asm") so we get also rid of all these warnings: WARNING: EXPORT symbol "_mcount" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "memcpy" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "memmove" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "memset" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "save_fpu_regs" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "sie64a" [vmlinux] version generation failed, symbol will not be versioned. WARNING: EXPORT symbol "sie_exit" [vmlinux] version generation failed, symbol will not be versioned. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/s390/include/asm/asm-prototypes.h | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 arch/s390/include/asm/asm-prototypes.h diff --git a/arch/s390/include/asm/asm-prototypes.h b/arch/s390/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..2c3413b0ca52 --- /dev/null +++ b/arch/s390/include/asm/asm-prototypes.h @@ -0,0 +1,8 @@ +#ifndef _ASM_S390_PROTOTYPES_H + +#include +#include +#include +#include + +#endif /* _ASM_S390_PROTOTYPES_H */ -- cgit v1.2.3 From c73eb1e0cc5675ebb584b3824b71f5c20ca33fc9 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 22 Dec 2016 08:19:46 +0100 Subject: xen: xenbus driver must not accept invalid transaction ids [ Upstream commit 639b08810d6ad74ded2c5f6e233c4fcb9d147168 ] When accessing Xenstore in a transaction the user is specifying a transaction id which he normally obtained from Xenstore when starting the transaction. Xenstore is validating a transaction id against all known transaction ids of the connection the request came in. As all requests of a domain not being the one where Xenstore lives share one connection, validation of transaction ids of different users of Xenstore in that domain should be done by the kernel of that domain being the multiplexer between the Xenstore users in that domain and Xenstore. In order to prohibit one Xenstore user "hijacking" a transaction from another user the xenbus driver has to verify a given transaction id against all known transaction ids of the user before forwarding it to Xenstore. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus_dev_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 0e0eb10f82a0..816a0e08ef10 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -316,7 +316,7 @@ static int xenbus_write_transaction(unsigned msg_type, rc = -ENOMEM; goto out; } - } else if (msg_type == XS_TRANSACTION_END) { + } else if (u->u.msg.tx_id != 0) { list_for_each_entry(trans, &u->transactions, list) if (trans->handle.id == u->u.msg.tx_id) break; -- cgit v1.2.3 From 3d7214a338d72a66739e83b32fe6e78688051ecc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Nov 2017 12:29:45 +0100 Subject: Revert "sctp: do not peel off an assoc from one netns to another one" This reverts commit 2a0e60907e54dad75e9b3568d02bec11d6e74f6b which is commit df80cd9b28b9ebaa284a41df611dbf3a2d05ca74 upstream as I messed up by applying it to the tree twice. Reported-by: Michal Kubecek Cc: ChunYu Wang Cc: Xin Long Cc: Marcelo Ricardo Leitner Cc: Neil Horman Cc: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 272edd7748a0..7f0f689b8d2b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4453,10 +4453,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) struct socket *sock; int err = 0; - /* Do not peel off from one netns to another one. */ - if (!net_eq(current->nsproxy->net_ns, sock_net(sk))) - return -EINVAL; - /* Do not peel off from one netns to another one. */ if (!net_eq(current->nsproxy->net_ns, sock_net(sk))) return -EINVAL; -- cgit v1.2.3 From 08c15ad2e6278a5fe1b209e8fcdbd2d235c48f34 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 30 Nov 2017 08:37:28 +0000 Subject: Linux 4.4.103 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e036fac9c04..f5a51cd7ca49 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 102 +SUBLEVEL = 103 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3