From 8b42fc47e1b64cb661fed8d96f874effbdf1d7f1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:25 -0700 Subject: mm: memcontrol: reclaim when shrinking memory.high below usage commit 588083bb37a3cea8533c392370a554417c8f29cb upstream. When setting memory.high below usage, nothing happens until the next charge comes along, and then it will only reclaim its own charge and not the now potentially huge excess of the new memory.high. This can cause groups to stay in excess of their memory.high indefinitely. To fix that, when shrinking memory.high, kick off a reclaim cycle that goes after the delta. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ee6acd279953..5d081ff28663 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5121,6 +5121,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5131,6 +5132,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } -- cgit v1.2.3 From 0ccab5b139971a2a3f48df24d1ee8be2dbf84042 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:20:28 -0700 Subject: mm: memcontrol: reclaim and OOM kill when shrinking memory.max below usage commit b6e6edcfa40561e9c8abe5eecf1c96f8e5fd9c6f upstream. Setting the original memory.limit_in_bytes hardlimit is subject to a race condition when the desired value is below the current usage. The code tries a few times to first reclaim and then see if the usage has dropped to where we would like it to be, but there is no locking, and the workload is free to continue making new charges up to the old limit. Thus, attempting to shrink a workload relies on pure luck and hope that the workload happens to cooperate. To fix this in the cgroup2 memory.max knob, do it the other way round: set the limit first, then try enforcement. And if reclaim is not able to succeed, trigger OOM kills in the group. Keep going until the new limit is met, we run out of OOM victims and there's only unreclaimable memory left, or the task writing to memory.max is killed. This allows users to shrink groups reliably, and the behavior is consistent with what happens when new charges are attempted in excess of memory.max. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5d081ff28663..fc0bcc41d57f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1332,7 +1332,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1410,6 +1410,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -5158,6 +5159,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5166,9 +5169,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; -- cgit v1.2.3 From 5dc7e939b6b8ca3d18d5814504954014578703e2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 25 Mar 2016 14:21:50 -0700 Subject: mm/page_alloc: prevent merging between isolated and other pageblocks commit d9dddbf556674bf125ecd925b24e43a5cf2a568a upstream. Hanjun Guo has reported that a CMA stress test causes broken accounting of CMA and free pages: > Before the test, I got: > -bash-4.3# cat /proc/meminfo | grep Cma > CmaTotal: 204800 kB > CmaFree: 195044 kB > > > After running the test: > -bash-4.3# cat /proc/meminfo | grep Cma > CmaTotal: 204800 kB > CmaFree: 6602584 kB > > So the freed CMA memory is more than total.. > > Also the the MemFree is more than mem total: > > -bash-4.3# cat /proc/meminfo > MemTotal: 16342016 kB > MemFree: 22367268 kB > MemAvailable: 22370528 kB Laura Abbott has confirmed the issue and suspected the freepage accounting rewrite around 3.18/4.0 by Joonsoo Kim. Joonsoo had a theory that this is caused by unexpected merging between MIGRATE_ISOLATE and MIGRATE_CMA pageblocks: > CMA isolates MAX_ORDER aligned blocks, but, during the process, > partialy isolated block exists. If MAX_ORDER is 11 and > pageblock_order is 9, two pageblocks make up MAX_ORDER > aligned block and I can think following scenario because pageblock > (un)isolation would be done one by one. > > (each character means one pageblock. 'C', 'I' means MIGRATE_CMA, > MIGRATE_ISOLATE, respectively. > > CC -> IC -> II (Isolation) > II -> CI -> CC (Un-isolation) > > If some pages are freed at this intermediate state such as IC or CI, > that page could be merged to the other page that is resident on > different type of pageblock and it will cause wrong freepage count. This was supposed to be prevented by CMA operating on MAX_ORDER blocks, but since it doesn't hold the zone->lock between pageblocks, a race window does exist. It's also likely that unexpected merging can occur between MIGRATE_ISOLATE and non-CMA pageblocks. This should be prevented in __free_one_page() since commit 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock"). However, we only check the migratetype of the pageblock where buddy merging has been initiated, not the migratetype of the buddy pageblock (or group of pageblocks) which can be MIGRATE_ISOLATE. Joonsoo has suggested checking for buddy migratetype as part of page_is_buddy(), but that would add extra checks in allocator hotpath and bloat-o-meter has shown significant code bloat (the function is inline). This patch reduces the bloat at some expense of more complicated code. The buddy-merging while-loop in __free_one_page() is initially bounded to pageblock_border and without any migratetype checks. The checks are placed outside, bumping the max_order if merging is allowed, and returning to the while-loop with a statement which can't be possibly considered harmful. This fixes the accounting bug and also removes the arguably weird state in the original commit 3c605096d315 where buddies could be left unmerged. Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Link: https://lkml.org/lkml/2016/3/2/280 Signed-off-by: Vlastimil Babka Reported-by: Hanjun Guo Tested-by: Hanjun Guo Acked-by: Joonsoo Kim Debugged-by: Laura Abbott Debugged-by: Joonsoo Kim Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Minchan Kim Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9d666df5ef95..c69531afbd8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -662,34 +662,28 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - unsigned int max_order = MAX_ORDER; + unsigned int max_order; + + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { - /* - * We restrict max order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. Without this, pageblock isolation - * could cause incorrect freepage accounting. - */ - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); - } else { + if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - } - page_idx = pfn & ((1 << max_order) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); +continue_merging: while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) - break; + goto done_merging; /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -706,6 +700,32 @@ static inline void __free_one_page(struct page *page, page_idx = combined_idx; order++; } + if (max_order < MAX_ORDER) { + /* If we are here, it means order is >= pageblock_order. + * We want to prevent merge between freepages on isolate + * pageblock and normal pageblock. Without this, pageblock + * isolation could cause incorrect freepage or CMA accounting. + * + * We don't want to hit this code for the more frequent + * low-order merging. + */ + if (unlikely(has_isolate_pageblock(zone))) { + int buddy_mt; + + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (is_migrate_isolate(migratetype) || + is_migrate_isolate(buddy_mt))) + goto done_merging; + } + max_order++; + goto continue_merging; + } + +done_merging: set_page_order(page, order); /* -- cgit v1.2.3