From 756000bec7c1ef7677027d0d8060ee402b83b5d1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:45 -0700 Subject: mm/compaction: fix invalid free_pfn and compact_cached_free_pfn commit 623446e4dc45b37740268165107cc63abb3022f0 upstream. free_pfn and compact_cached_free_pfn are the pointer that remember restart position of freepage scanner. When they are reset or invalid, we set them to zone_end_pfn because freepage scanner works in reverse direction. But, because zone range is defined as [zone_start_pfn, zone_end_pfn), zone_end_pfn is invalid to access. Therefore, we should not store it to free_pfn and compact_cached_free_pfn. Instead, we need to store zone_end_pfn - 1 to them. There is one more thing we should consider. Freepage scanner scan reversely by pageblock unit. If free_pfn and compact_cached_free_pfn are set to middle of pageblock, it regards that sitiation as that it already scans front part of pageblock so we lose opportunity to scan there. To fix-up, this patch do round_down() to guarantee that reset position will be pageblock aligned. Note that thanks to the current pageblock_pfn_to_page() implementation, actual access to zone_end_pfn doesn't happen until now. But, following patch will change pageblock_pfn_to_page() so this patch is needed from now on. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index dba02dec7195..b6f98695d9e6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -200,7 +200,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -1358,11 +1359,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; -- cgit v1.2.3 From 9780795fc1e5e3d6a98374df01d1f10372203fdf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 15 Mar 2016 14:57:48 -0700 Subject: mm/compaction: pass only pageblock aligned range to pageblock_pfn_to_page commit e1409c325fdc1fef7b3d8025c51892355f065d15 upstream. pageblock_pfn_to_page() is used to check there is valid pfn and all pages in the pageblock is in a single zone. If there is a hole in the pageblock, passing arbitrary position to pageblock_pfn_to_page() could cause to skip whole pageblock scanning, instead of just skipping the hole page. For deterministic behaviour, it's better to always pass pageblock aligned range to pageblock_pfn_to_page(). It will also help further optimization on pageblock_pfn_to_page() in the following patch. Signed-off-by: Joonsoo Kim Cc: Aaron Lu Cc: David Rientjes Cc: Mel Gorman Cc: Rik van Riel Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/compaction.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b6f98695d9e6..b6f145ed7ae1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -553,13 +553,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -572,11 +576,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -862,18 +868,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1091,7 +1102,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1103,16 +1116,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1123,7 +1141,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1142,8 +1161,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); -- cgit v1.2.3 From 45c26b0736efad011b829d587fadf7085f948294 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 15 Mar 2016 14:55:27 -0700 Subject: mm/page-writeback: fix dirty_ratelimit calculation commit d59b1087a98e402ed9a7cc577f4da435f9a555f5 upstream. Calculation of dirty_ratelimit sometimes is not correct. E.g. initial values of dirty_ratelimit == INIT_BW and step == 0, lead to the following result: UBSAN: Undefined behaviour in ../mm/page-writeback.c:1286:7 shift exponent 25600 is too large for 64-bit type 'long unsigned int' The fix is straightforward - make step 0 if the shift exponent is too big. Signed-off-by: Andrey Ryabinin Cc: Wu Fengguang Cc: Tejun Heo Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- mm/page-writeback.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fd51ebfc423f..6d0dbde4503b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1162,6 +1162,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1286,11 +1287,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; -- cgit v1.2.3 From eeca23f21cfb78bf537e1575874862cee91f7688 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 20 May 2016 16:59:54 -0700 Subject: mm/zswap: use workqueue to destroy pool commit 200867af4dedfe7cb707f96773684de1d1fd21e6 upstream. Add a work_struct to struct zswap_pool, and change __zswap_pool_empty to use the workqueue instead of using call_rcu(). When zswap destroys a pool no longer in use, it uses call_rcu() to perform the destruction/freeing. Since that executes in softirq context, it must not sleep. However, actually destroying the pool involves freeing the per-cpu compressors (which requires locking the cpu_add_remove_lock mutex) and freeing the zpool, for which the implementation may sleep (e.g. zsmalloc calls kmem_cache_destroy, which locks the slab_mutex). So if either mutex is currently taken, or any other part of the compressor or zpool implementation sleeps, it will result in a BUG(). It's not easy to reproduce this when changing zswap's params normally. In testing with a loaded system, this does not fail: $ cd /sys/module/zswap/parameters $ echo lz4 > compressor ; echo zsmalloc > zpool nor does this: $ while true ; do > echo lzo > compressor ; echo zbud > zpool > sleep 1 > echo lz4 > compressor ; echo zsmalloc > zpool > sleep 1 > done although it's still possible either of those might fail, depending on whether anything else besides zswap has locked the mutexes. However, changing a parameter with no delay immediately causes the schedule while atomic BUG: $ while true ; do > echo lzo > compressor ; echo lz4 > compressor > done This is essentially the same as Yu Zhao's proposed patch to zsmalloc, but moved to zswap, to cover compressor and zpool freeing. Fixes: f1c54846ee45 ("zswap: dynamic pool creation") Signed-off-by: Dan Streetman Reported-by: Yu Zhao Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- mm/zswap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 45476f429789..70775d37747a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -123,7 +123,7 @@ struct zswap_pool { struct crypto_comp * __percpu *tfm; struct kref kref; struct list_head list; - struct rcu_head rcu_head; + struct work_struct work; struct notifier_block notifier; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -667,9 +667,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) return kref_get_unless_zero(&pool->kref); } -static void __zswap_pool_release(struct rcu_head *head) +static void __zswap_pool_release(struct work_struct *work) { - struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head); + struct zswap_pool *pool = container_of(work, typeof(*pool), work); + + synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); @@ -689,7 +691,9 @@ static void __zswap_pool_empty(struct kref *kref) WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); - call_rcu(&pool->rcu_head, __zswap_pool_release); + + INIT_WORK(&pool->work, __zswap_pool_release); + schedule_work(&pool->work); spin_unlock(&zswap_pools_lock); } -- cgit v1.2.3 From c4eafbc2ba115bf6a4dbbc12874e16aa7f0c7283 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Mon, 27 Feb 2017 14:26:53 -0800 Subject: zswap: don't param_set_charp while holding spinlock commit fd5bb66cd934987e49557455b6497fc006521940 upstream. Change the zpool/compressor param callback function to release the zswap_pools_lock spinlock before calling param_set_charp, since that function may sleep when it calls kmalloc with GFP_KERNEL. While this problem has existed for a while, I wasn't able to trigger it using a tight loop changing either/both the zpool and compressor params; I think it's very unlikely to be an issue on the stable kernels, especially since most zswap users will change the compressor and/or zpool from sysfs only one time each boot - or zero times, if they add the params to the kernel boot. Fixes: c99b42c3529e ("zswap: use charp for zswap param strings") Link: http://lkml.kernel.org/r/20170126155821.4545-1-ddstreet@ieee.org Signed-off-by: Dan Streetman Reported-by: Sergey Senozhatsky Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- mm/zswap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 70775d37747a..568015e2fe7a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -752,18 +752,22 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, pool = zswap_pool_find_get(type, compressor); if (pool) { zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); - } else { - spin_unlock(&zswap_pools_lock); - pool = zswap_pool_create(type, compressor); - spin_lock(&zswap_pools_lock); } + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + if (pool) ret = param_set_charp(s, kp); else ret = -EINVAL; + spin_lock(&zswap_pools_lock); + if (!ret) { put_pool = zswap_pool_current(); list_add_rcu(&pool->list, &zswap_pools); -- cgit v1.2.3