Diffstat (limited to 'mm')
48 files changed, 5244 insertions, 1425 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 97a4e06b15c0..7077376523ed 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390 || ARM64) config MEMORY_HOTPLUG_SPARSE def_bool y @@ -619,6 +619,44 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. +config ZCACHE + bool "Compressed cache for file pages (EXPERIMENTAL)" + depends on CRYPTO && CLEANCACHE + select CRYPTO_LZO + select ZBUD + default n + help + A compressed cache for file pages. + It takes active file pages that are in the process of being reclaimed + and attempts to compress them into a dynamically allocated RAM-based + memory pool. + + If this process is successful, then when those file pages are needed + again, the I/O read operation is avoided. This results in significant + performance gains under memory pressure for systems full of file pages. + +config BALANCE_ANON_FILE_RECLAIM + bool "During reclaim treat anon and file backed pages equally" + depends on SWAP + help + When performing memory reclaim treat anonymous and file backed pages + equally. + Swapping anonymous pages out to memory can be efficient enough to justify + treating anonymous and file backed pages equally. + +config KSWAPD_CPU_AFFINITY_MASK + string "kswapd cpu affinity mask" + depends on SMP + help + Set the cpu affinity for the kswapd task. + There can be power benefits on certain targets when limiting kswapd + to run only on certain cores. + The cpu affinity bitmask is represented by a hex string where commas + group hex digits into chunks. Each chunk defines exactly 32 bits of + the resultant bitmask. + For example, to limit kswapd to the first 4 cores use the following: + CONFIG_KSWAPD_CPU_AFFINITY_MASK="f" + # For architectures that support deferred memory initialisation config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool @@ -668,3 +706,29 @@ config ZONE_DEVICE config FRAME_VECTOR bool + +config FORCE_ALLOC_FROM_DMA_ZONE + bool "Force certain memory allocators to always return ZONE_DMA memory" + depends on ZONE_DMA + help + Ensure certain memory allocators always return memory from ZONE_DMA. + This option helps ensure that clients who require ZONE_DMA memory are + always using ZONE_DMA memory. + + If unsure, say "n". + +config PROCESS_RECLAIM + bool "Enable process reclaim" + depends on PROC_FS + default n + help + This allows reclaiming the pages of a process via /proc/PID/reclaim. + + (echo file > /proc/PID/reclaim) reclaims file-backed pages only. + (echo anon > /proc/PID/reclaim) reclaims anonymous pages only. + (echo all > /proc/PID/reclaim) reclaims all pages. + + (echo addr size-bytes > /proc/PID/reclaim) reclaims pages in + (addr, addr + size-bytes) of the process. + + Any other value is ignored. diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da53ddd..7470fd60fc59 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,76 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + +config SLUB_DEBUG_PANIC_ON + bool "Enable panic on SLUB corruption detection" + depends on SLUB_DEBUG + help + SLUB has a resiliency feature which restores corrupted bytes so + that production environments can continue to operate. With debug + options enabled this may not be desirable, as it prevents + investigating the root cause, which may lie within the cache + or memory. + config PAGE_POISONING - bool + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_ENABLE_DEFAULT + bool "Enable page poisoning by default?" + default n + depends on PAGE_POISONING + ---help--- + Enable page poisoning of free pages by default? This value + can be overridden by page_poison=off|on. This can be used + to avoid passing the kernel parameter and leave the page + poisoning feature enabled by default. + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation.
+ + Enabling page poisoning with this option will disable hibernation. + + If unsure, say N diff --git a/mm/Makefile b/mm/Makefile index ec91e951da28..04d48b46dbe9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -40,7 +40,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o $(mmu-y) showmem.o vmpressure.o obj-y += init-mm.o @@ -56,9 +56,10 @@ ifdef CONFIG_MMU endif obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o -obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_ratio.o obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o +obj-$(CONFIG_ZCACHE) += zcache.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o @@ -67,7 +68,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o @@ -79,7 +80,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o -obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG) += memcontrol.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o @@ -100,4 +101,5 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_PROCESS_RECLAIM) += process_reclaim.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7f80b1a1bc34..3081f1234d4e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -237,6 +237,7 @@ static __init int bdi_class_init(void) bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); + return 0; } postcore_initcall(bdi_class_init); @@ -780,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->dev = NULL; + kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; @@ -795,6 +797,22 @@ int bdi_init(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_init); +struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +{ + struct backing_dev_info *bdi; + + bdi = kmalloc_node(sizeof(struct backing_dev_info), + gfp_mask | __GFP_ZERO, node_id); + if (!bdi) + return NULL; + + if (bdi_init(bdi)) { + kfree(bdi); + return NULL; + } + return bdi; +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...)
{ @@ -875,12 +893,26 @@ void bdi_unregister(struct backing_dev_info *bdi) } } -void bdi_exit(struct backing_dev_info *bdi) +static void bdi_exit(struct backing_dev_info *bdi) { WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); } +static void release_bdi(struct kref *ref) +{ + struct backing_dev_info *bdi = + container_of(ref, struct backing_dev_info, refcnt); + + bdi_exit(bdi); + kfree(bdi); +} + +void bdi_put(struct backing_dev_info *bdi) +{ + kref_put(&bdi->refcnt, release_bdi); +} + void bdi_destroy(struct backing_dev_info *bdi) { bdi_unregister(bdi); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 300117f1a08f..6c563a4846c4 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) */ if (trylock_page(page)) { #ifdef CONFIG_BALLOON_COMPACTION - if (!PagePrivate(page)) { + if (PageIsolated(page)) { /* raced with isolation */ unlock_page(page); continue; @@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); #ifdef CONFIG_BALLOON_COMPACTION -static inline void __isolate_balloon_page(struct page *page) +bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - ClearPagePrivate(page); list_del(&page->lru); b_dev_info->isolated_pages++; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + return true; } -static inline void __putback_balloon_page(struct page *page) +void balloon_page_putback(struct page *page) { struct balloon_dev_info *b_dev_info = balloon_page_device(page); unsigned long flags; spin_lock_irqsave(&b_dev_info->pages_lock, flags); - SetPagePrivate(page); list_add(&page->lru, &b_dev_info->pages); b_dev_info->isolated_pages--; spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } -/* __isolate_lru_page() counterpart for a ballooned page */ -bool balloon_page_isolate(struct page *page) -{ - /* - * Avoid burning cycles with pages that are yet under __free_pages(), - * or just got freed under us. - * - * In case we 'win' a race for a balloon page being freed under us and - * raise its refcount preventing __free_pages() from doing its job - * the put_page() at the end of this block will take care of - * release this page, thus avoiding a nasty leakage. - */ - if (likely(get_page_unless_zero(page))) { - /* - * As balloon pages are not isolated from LRU lists, concurrent - * compaction threads can race against page migration functions - * as well as race against the balloon driver releasing a page. - * - * In order to avoid having an already isolated balloon page - * being (wrongly) re-isolated while it is under migration, - * or to avoid attempting to isolate pages being released by - * the balloon driver, lets be sure we have the page lock - * before proceeding with the balloon page isolation steps. - */ - if (likely(trylock_page(page))) { - /* - * A ballooned page, by default, has PagePrivate set. - * Prevent concurrent compaction threads from isolating - * an already isolated balloon page by clearing it. 
- */ - if (balloon_page_movable(page)) { - __isolate_balloon_page(page); - unlock_page(page); - return true; - } - unlock_page(page); - } - put_page(page); - } - return false; -} - -/* putback_lru_page() counterpart for a ballooned page */ -void balloon_page_putback(struct page *page) -{ - /* - * 'lock_page()' stabilizes the page and prevents races against - * concurrent isolation threads attempting to re-isolate it. - */ - lock_page(page); - - if (__is_movable_balloon_page(page)) { - __putback_balloon_page(page); - /* drop the extra ref count taken for page isolation */ - put_page(page); - } else { - WARN_ON(1); - dump_page(page, "not movable balloon page"); - } - unlock_page(page); -} /* move_to_new_page() counterpart for a ballooned page */ -int balloon_page_migrate(struct page *newpage, - struct page *page, enum migrate_mode mode) +int balloon_page_migrate(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); - int rc = -EAGAIN; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - if (WARN_ON(!__is_movable_balloon_page(page))) { - dump_page(page, "not movable balloon page"); - return rc; - } + return balloon->migratepage(balloon, newpage, page, mode); +} - if (balloon && balloon->migratepage) - rc = balloon->migratepage(balloon, newpage, page, mode); +const struct address_space_operations balloon_aops = { + .migratepage = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; +EXPORT_SYMBOL_GPL(balloon_aops); - return rc; -} #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b6380784c28..90336470273c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -154,7 +154,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) * down, but we are still initializing the system. Pages are given directly * to the page allocator, no bootmem metadata is updated because it is gone. 
*/ -void __init free_bootmem_late(unsigned long physaddr, unsigned long size) +void free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; @@ -35,6 +35,7 @@ #include <linux/cma.h> #include <linux/highmem.h> #include <linux/io.h> +#include <linux/delay.h> #include <trace/events/cma.h> #include "cma.h" @@ -131,6 +132,10 @@ static int __init cma_activate_area(struct cma *cma) spin_lock_init(&cma->mem_head_lock); #endif + if (!PageHighMem(pfn_to_page(cma->base_pfn))) + kmemleak_free_part(__va(cma->base_pfn << PAGE_SHIFT), + cma->count << PAGE_SHIFT); + return 0; err: @@ -367,6 +372,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; + int retry_after_sleep = 0; if (!cma || !cma->count) return NULL; @@ -377,19 +383,40 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) if (!count) return NULL; + trace_cma_alloc_start(count, align); + mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); bitmap_count = cma_bitmap_pages_to_bits(cma, count); + if (bitmap_count > bitmap_maxno) + return NULL; + for (;;) { mutex_lock(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); - break; + if (retry_after_sleep < 2) { + start = 0; + /* + * Page may be momentarily pinned by some other + * process which has been scheduled out, eg. + * in exit path, during unmap call, or process + * fork and so cannot be freed there. Sleep + * for 100ms and retry twice to see if it has + * been freed later. + */ + mutex_unlock(&cma->lock); + msleep(100); + retry_after_sleep++; + continue; + } else { + mutex_unlock(&cma->lock); + break; + } } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); /* @@ -414,6 +441,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } diff --git a/mm/compaction.c b/mm/compaction.c index b6f145ed7ae1..86687ec1d034 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 
2007-2010 Mel Gorman <mel@csn.ul.ie> */ +#include <linux/cpu.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/compaction.h> @@ -14,9 +15,11 @@ #include <linux/backing-dev.h> #include <linux/sysctl.h> #include <linux/sysfs.h> -#include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/page_owner.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -57,13 +60,27 @@ static unsigned long release_freepages(struct list_head *freelist) static void map_pages(struct list_head *list) { - struct page *page; + unsigned int i, order, nr_pages; + struct page *page, *next; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + + order = page_private(page); + nr_pages = 1 << order; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); - list_for_each_entry(page, list, lru) { - arch_alloc_page(page, 0); - kernel_map_pages(page, 1, 1); - kasan_alloc_pages(page, 0); + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } } + + list_splice(&tmp_list, list); } static inline bool migrate_async_suitable(int migratetype) @@ -116,6 +133,44 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, #ifdef CONFIG_COMPACTION +int PageMovable(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (!__PageMovable(page)) + return 0; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) + return 1; + + return 0; +} +EXPORT_SYMBOL(PageMovable); + +void __SetPageMovable(struct page *page, struct address_space *mapping) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__SetPageMovable); + +void __ClearPageMovable(struct page *page) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + /* + * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE + * flag so that VM can catch up released page by driver after isolation. + * With it, VM migration doesn't try to put it back. + */ + page->mapping = (void *)((unsigned long)page->mapping & + PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__ClearPageMovable); + /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -403,12 +458,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; + unsigned int order; cursor = pfn_to_page(blockpfn); /* Isolate free pages. 
*/ for (; blockpfn < end_pfn; blockpfn++, cursor++) { - int isolated, i; + int isolated; struct page *page = cursor; /* @@ -474,17 +530,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, goto isolate_fail; } - /* Found a free page, break it into order-0 pages */ - isolated = split_free_page(page); + /* Found a free page, will break it into order-0 pages */ + order = page_order(page); + isolated = __isolate_free_page(page, order); if (!isolated) break; + set_page_private(page, order); total_isolated += isolated; cc->nr_freepages += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; - } + list_add_tail(&page->lru, freelist); + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { blockpfn += isolated; break; @@ -603,7 +659,7 @@ isolate_freepages_range(struct compact_control *cc, */ } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(&freelist); if (pfn < end_pfn) { @@ -632,21 +688,46 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc) mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); } -/* Similar to reclaim, but different enough that they don't share logic */ -static bool too_many_isolated(struct zone *zone) +static bool __too_many_isolated(struct zone *zone, int safe) { unsigned long active, inactive, isolated; - inactive = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_ANON); - active = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_FILE) + - zone_page_state(zone, NR_ISOLATED_ANON); + if (safe) { + inactive = zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON); + active = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ACTIVE_ANON); + isolated = zone_page_state_snapshot(zone, NR_ISOLATED_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON); + active = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_FILE) + + zone_page_state(zone, NR_ISOLATED_ANON); + } return isolated > (inactive + active) / 2; } +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(struct compact_control *cc) +{ + /* + * __too_many_isolated(safe=0) is fast but inaccurate, because it + * doesn't account for the vm_stat_diff[] counters. So if it looks + * like too_many_isolated() is about to return true, fall back to the + * slower, more accurate zone_page_state_snapshot(). + */ + if (unlikely(__too_many_isolated(cc->zone, 0))) { + if (cc->mode != MIGRATE_ASYNC) + return __too_many_isolated(cc->zone, 1); + } + + return false; +} + /** * isolate_migratepages_block() - isolate all migrate-able pages within * a single pageblock @@ -683,7 +764,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * list by either parallel reclaimers or compaction. 
If there are, * delay for some time until fewer pages are isolated */ - while (unlikely(too_many_isolated(zone))) { + while (unlikely(too_many_isolated(cc))) { /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) return 0; @@ -699,7 +780,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { - bool is_lru; /* * Periodically drop the lock (if held) regardless of its @@ -740,21 +820,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* - * Check may be lockless but that's ok as we recheck later. - * It's possible to migrate LRU pages and balloon pages - * Skip any other type of page - */ - is_lru = PageLRU(page); - if (!is_lru) { - if (unlikely(balloon_page_movable(page))) { - if (balloon_page_isolate(page)) { - /* Successfully isolated */ - goto isolate_success; - } - } - } - - /* * Regardless of being on LRU, compound pages such as THP and * hugetlbfs are not to be compacted. We can potentially save * a lot of iterations if we skip them at once. The check is @@ -770,8 +835,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, continue; } - if (!is_lru) + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU and non-lru movable pages. + * Skip any other type of page + */ + if (!PageLRU(page)) { + /* + * __PageMovable can return false positive so we need + * to verify it under page_lock. + */ + if (unlikely(__PageMovable(page)) && + !PageIsolated(page)) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, + flags); + locked = false; + } + + if (!isolate_movable_page(page, isolate_mode)) + goto isolate_success; + } + continue; + } /* * Migration will fail if an anonymous page is pinned in memory, @@ -1026,7 +1113,7 @@ static void isolate_freepages(struct compact_control *cc) } } - /* split_free_page does not map the pages */ + /* __isolate_free_page() does not map the pages */ map_pages(freelist); /* @@ -1218,11 +1305,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1365,10 +1452,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. 
*/ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1504,6 +1590,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1762,4 +1849,225 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + if (kthread_should_stop()) + return; + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. 
But remember + the requested order/classzone_idx in case it was higher/tighter than + our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as its memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings.
+ */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c deleted file mode 100644 index 3b8f1b83610e..000000000000 --- a/mm/debug-pagealloc.c +++ /dev/null @@ -1,143 +0,0 @@ -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/page_ext.h> -#include <linux/poison.h> -#include <linux/ratelimit.h> - -static bool page_poisoning_enabled __read_mostly; - -static bool need_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return false; - - return true; -} - -static void init_page_poisoning(void) -{ - if (!debug_pagealloc_enabled()) - return; - - page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (!page_ext) - return; - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (!page_ext) - return; - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline bool page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (!page_ext) - return false; - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static void poison_page(struct page *page) -{ - void *addr = kmap_atomic(page); - - set_page_poison(page); - memset(addr, PAGE_POISON, PAGE_SIZE); - kunmap_atomic(addr); -} - -static void poison_pages(struct page *page, int n) -{ - int i; - - for (i = 0; i < n; i++) - poison_page(page + i); -} - -static bool single_bit_flip(unsigned char a, unsigned char b) -{ - unsigned char error = a ^ b; - - return error && !(error & (error - 1)); -} - -static void check_poison_mem(unsigned char *mem, size_t bytes) -{ - static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); - unsigned char *start; - unsigned char *end; - - start = memchr_inv(mem, PAGE_POISON, bytes); - if (!start) - return; - - for (end = mem + bytes - 1; end > start; end--) { - if (*end != PAGE_POISON) - break; - } - - if (!__ratelimit(&ratelimit)) - return; - else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); - else - printk(KERN_ERR "pagealloc: memory corruption\n"); - - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, - end - start + 1, 1); - dump_stack(); -} - -static void unpoison_page(struct page *page) -{ - void *addr; - - if (!page_poison(page)) - return; - - addr = kmap_atomic(page); - check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); - kunmap_atomic(addr); -} - -static void unpoison_pages(struct page *page, int n) 
-{ - int i; - - for (i = 0; i < n; i++) - unpoison_page(page + i); -} - -void __kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} diff --git a/mm/debug.c b/mm/debug.c index 668aa35191ca..3621385c09ac 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,6 +9,18 @@ #include <linux/mm.h> #include <linux/trace_events.h> #include <linux/memcontrol.h> +#include <linux/migrate.h> +#include <linux/page_owner.h> + +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", +}; static const struct trace_print_flags pageflag_names[] = { {1UL << PG_locked, "locked" }, @@ -47,6 +59,9 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_young, "young" }, {1UL << PG_idle, "idle" }, #endif +#ifdef CONFIG_ZCACHE + {1UL << PG_was_active, "was_active" }, +#endif }; static void dump_flags(unsigned long flags, @@ -103,6 +118,7 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); diff --git a/mm/filemap.c b/mm/filemap.c index b15f1d8bba43..750af2219081 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -239,10 +239,12 @@ void __delete_from_page_cache(struct page *page, void *shadow, * invalidate any existing cleancache entries. We can't leave * stale data around in the cleancache once our page is gone */ - if (PageUptodate(page) && PageMappedToDisk(page)) + if (PageUptodate(page) && PageMappedToDisk(page)) { + count_vm_event(PGPGOUTCLEAN); cleancache_put_page(page); - else + } else { cleancache_invalidate_page(mapping, page); + } page_cache_tree_delete(mapping, page, shadow); diff --git a/mm/internal.h b/mm/internal.h index f63f4393d633..e17af58d2bf7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -182,6 +182,8 @@ extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif +extern void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags); extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -206,6 +208,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... 
*/ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ @@ -310,10 +313,8 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); -#endif #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index e1100433cefe..2976a9ee104f 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b0fe986a2856..ddc6c93966d2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -223,8 +223,18 @@ static unsigned long jiffies_min_age; static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; -/* enables or disables the task stacks scanning */ + +/* Enables or disables the task stacks scanning. + * Set to 1 to have it enabled at compile time, + * or to 0 to have it disabled by default. + * It can be enabled at runtime by writing "stack=on" + * to the kmemleak debugfs entry. */ +#ifdef CONFIG_DEBUG_TASK_STACK_SCAN_OFF +static int kmemleak_stack_scan; +#else static int kmemleak_stack_scan = 1; +#endif + /* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ @@ -37,6 +37,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/show_mem_notifier.h> #include <asm/tlbflush.h> #include "internal.h" @@ -223,6 +224,9 @@ static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; +/* Boolean to indicate whether to use deferred timer or not */ +static bool use_deferred_timer; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -236,7 +240,7 @@ static int ksm_nr_node_ids = 1; #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 #define KSM_RUN_OFFLINE 4 -static unsigned long ksm_run = KSM_RUN_STOP; +static unsigned long ksm_run = KSM_RUN_MERGE; static void wait_while_offlining(void); static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); @@ -247,6 +251,20 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock); sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) +static int ksm_show_mem_notifier(struct notifier_block *nb, + unsigned long action, + void *data) +{ + pr_info("ksm_pages_sharing: %lu\n", ksm_pages_sharing); + pr_info("ksm_pages_shared: %lu\n", ksm_pages_shared); + + return 0; +} + +static struct notifier_block ksm_show_mem_notifier_block = { + .notifier_call = ksm_show_mem_notifier, +}; + static int __init ksm_slab_init(void) { rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); @@ -541,8 +559,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) void *expected_mapping; unsigned long kpfn; - expected_mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + expected_mapping = (void *)((unsigned long)stable_node | + PAGE_MAPPING_KSM); again: kpfn =
READ_ONCE(stable_node->kpfn); page = pfn_to_page(kpfn); @@ -1725,6 +1743,41 @@ static void ksm_do_scan(unsigned int scan_npages) } } +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +static signed long __sched deferred_schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + __set_current_state(TASK_INTERRUPTIBLE); + if (timeout < 0) { + pr_err("schedule_timeout: wrong timeout value %lx\n", + timeout); + __set_current_state(TASK_RUNNING); + goto out; + } + + expire = timeout + jiffies; + + setup_deferrable_timer_on_stack(&timer, process_timeout, + (unsigned long)current); + mod_timer(&timer, expire); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + +out: + return timeout < 0 ? 0 : timeout; +} + static int ksmd_should_run(void) { return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); @@ -1745,7 +1798,11 @@ static int ksm_scan_thread(void *nothing) try_to_freeze(); if (ksmd_should_run()) { - schedule_timeout_interruptible( + if (use_deferred_timer) + deferred_schedule_timeout( + msecs_to_jiffies(ksm_thread_sleep_millisecs)); + else + schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { wait_event_freezable(ksm_thread_wait, @@ -1928,6 +1985,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) return ret; + again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -2197,6 +2255,26 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, } KSM_ATTR(run); +static ssize_t deferred_timer_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 8, "%d\n", use_deferred_timer); +} + +static ssize_t deferred_timer_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long enable; + int err; + + err = kstrtoul(buf, 10, &enable); + use_deferred_timer = enable; + + return count; +} +KSM_ATTR(deferred_timer); + #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -2309,6 +2387,7 @@ static struct attribute *ksm_attrs[] = { &pages_unshared_attr.attr, &pages_volatile_attr.attr, &full_scans_attr.attr, + &deferred_timer_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif @@ -2353,6 +2432,8 @@ static int __init ksm_init(void) /* There is no significance to this priority 100 */ hotplug_memory_notifier(ksm_memory_callback, 100); #endif + + show_mem_notifier_register(&ksm_show_mem_notifier_block); return 0; out_free: diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; diff --git a/mm/memblock.c b/mm/memblock.c index 99c7f493d45f..e39ef2fe5c17 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -19,6 +19,9 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/memblock.h> +#include <linux/preempt.h> 
+#include <linux/seqlock.h> +#include <linux/irqflags.h> #include <asm-generic/sections.h> #include <linux/io.h> @@ -31,6 +34,7 @@ static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIO static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock; #endif +static seqcount_t memblock_seq; struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ @@ -733,7 +737,8 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) (unsigned long long)base + size - 1, (void *)_RET_IP_); - kmemleak_free_part(__va(base), size); + if (base < memblock.current_limit) + kmemleak_free_part(__va(base), size); return memblock_remove_range(&memblock.reserved, base, size); } @@ -834,6 +839,16 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) } /** + * memblock_clear_nomap - Clear a flag of MEMBLOCK_NOMAP memory region + * @base: the base phys addr of the region + * @size: the size of the region + */ +int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); +} + +/** * __next_reserved_mem_region - next function for for_each_reserved_region() * @idx: pointer to u64 loop variable * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL @@ -1169,7 +1184,8 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, * The min_count is set to 0 so that memblock allocations are * never reported as leaks. */ - kmemleak_alloc(__va(found), size, 0, 0); + if (found < memblock.current_limit) + kmemleak_alloc(__va(found), size, 0, 0); return found; } return 0; @@ -1509,7 +1525,7 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) (phys_addr_t)ULLONG_MAX); } -static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +static int __init_memblock __memblock_search(struct memblock_type *type, phys_addr_t addr) { unsigned int left = 0, right = type->cnt; @@ -1527,6 +1543,19 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr return -1; } +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + int ret; + unsigned long seq; + + do { + seq = raw_read_seqcount_begin(&memblock_seq); + ret = __memblock_search(type, addr); + } while (unlikely(read_seqcount_retry(&memblock_seq, seq))); + + return ret; +} + int __init memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; @@ -1585,6 +1614,14 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size memblock.memory.regions[idx].size) >= end; } +bool __init_memblock memblock_overlaps_memory(phys_addr_t base, + phys_addr_t size) +{ + memblock_cap_size(base, &size); + + return memblock_overlaps_region(&memblock.memory, base, size); +} + /** * memblock_is_region_reserved - check if a region intersects reserved memory * @base: base of region to check @@ -1701,6 +1738,37 @@ void __init memblock_allow_resize(void) memblock_can_resize = 1; } +static unsigned long __init_memblock +memblock_resize_late(int begin, unsigned long flags) +{ + static int memblock_can_resize_old; + + if (begin) { + preempt_disable(); + local_irq_save(flags); + memblock_can_resize_old = memblock_can_resize; + memblock_can_resize = 0; + raw_write_seqcount_begin(&memblock_seq); + } else { + raw_write_seqcount_end(&memblock_seq); + 
memblock_can_resize = memblock_can_resize_old; + local_irq_restore(flags); + preempt_enable(); + } + + return flags; +} + +unsigned long __init_memblock memblock_region_resize_late_begin(void) +{ + return memblock_resize_late(1, 0); +} + +void __init_memblock memblock_region_resize_late_end(unsigned long flags) +{ + memblock_resize_late(0, flags); +} + static int __init early_memblock(char *p) { if (p && strstr(p, "debug")) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 92a647957f91..d4271ebc37d0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1016,7 +1016,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); + ret = try_to_unmap(hpage, ttu, NULL); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); diff --git a/mm/memory.c b/mm/memory.c index 177cb7d111a9..78ab57141731 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2634,7 +2634,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if ((PageSwapCache(page) && vm_swap_full(page_swap_info(page))) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { @@ -2849,7 +2850,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, } static unsigned long fault_around_bytes __read_mostly = - rounddown_pow_of_two(65536); + rounddown_pow_of_two(4096); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4d91216dcab2..191b9eff7fc3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -32,6 +32,7 @@ #include <linux/hugetlb.h> #include <linux/memblock.h> #include <linux/bootmem.h> +#include <linux/compaction.h> #include <asm/tlbflush.h> @@ -44,7 +45,7 @@ * and restore_online_page_callback() for generic callback restore. 
*/ -static void generic_online_page(struct page *page); +static int generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -856,11 +857,12 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static int generic_online_page(struct page *page) { __online_page_set_limits(page); __online_page_increment_counters(page); __online_page_free(page); + return 0; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, @@ -869,11 +871,13 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + int ret; if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; + ret = (*online_page_callback)(page); + if (!ret) + onlined_pages++; } *(unsigned long *)arg = onlined_pages; return 0; @@ -1012,7 +1016,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1052,7 +1056,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1063,8 +1067,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) - kswapd_run(zone_to_nid(zone)); + if (onlined_pages) { + kswapd_run(nid); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1417,10 +1423,10 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, } /* - * Scan pfn range [start,end) to find movable/migratable pages (LRU pages - * and hugepages). We scan pfn because it's much easier than scanning over - * linked list. This function returns the pfn of the first found movable - * page if it's found, otherwise 0. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, + * non-lru movable pages and hugepages). We scan pfn because it's much + * easier than scanning over linked list. This function returns the pfn + * of the first found movable page if it's found, otherwise 0. */ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { @@ -1431,6 +1437,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (__PageMovable(page)) + return pfn; if (PageHuge(page)) { if (page_huge_active(page)) return pfn; @@ -1474,22 +1482,24 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!get_page_unless_zero(page)) continue; /* - * We can skip free pages. And we can only deal with pages on - * LRU. + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. 
*/ - ret = isolate_lru_page(page); + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); if (!ret) { /* Success */ put_page(page); list_add_tail(&page->lru, &source); move_pages--; - inc_zone_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); - + if (!__PageMovable(page)) + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } else { #ifdef CONFIG_DEBUG_VM - printk(KERN_ALERT "removing pfn %lx from LRU failed\n", - pfn); - dump_page(page, "failed to remove from LRU"); + pr_alert("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); #endif put_page(page); /* Because we don't have big zone->lock, we should @@ -1842,8 +1852,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/memtest.c b/mm/memtest.c index 8eaa4c3a5f65..15a423eb0c29 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -80,8 +80,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) } /* default is disabled */ -static unsigned int memtest_pattern __initdata; - +static unsigned int memtest_pattern __initdata = + CONFIG_MEMTEST_ENABLE_DEFAULT; static int __init parse_memtest(char *arg) { int ret = 0; diff --git a/mm/migrate.c b/mm/migrate.c index afedcfab60e2..921cf12b03ce 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -31,6 +31,7 @@ #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/backing-dev.h> +#include <linux/compaction.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> @@ -38,6 +39,7 @@ #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/page_owner.h> #include <linux/ptrace.h> #include <asm/tlbflush.h> @@ -73,6 +75,81 @@ int migrate_prep_local(void) return 0; } +int isolate_movable_page(struct page *page, isolate_mode_t mode) +{ + struct address_space *mapping; + + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a movable page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * releasing this page, thus avoiding a nasty leakage. + */ + if (unlikely(!get_page_unless_zero(page))) + goto out; + + /* + * Check PageMovable before holding a PG_lock because page's owner + * assumes anybody doesn't touch PG_lock of newly allocated page + * so unconditionally grabbing the lock ruins page's owner side. + */ + if (unlikely(!__PageMovable(page))) + goto out_putpage; + /* + * As movable pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the release of a page. + * + * In order to avoid having an already isolated movable page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released, + * let's be sure we have the page lock + * before proceeding with the movable page isolation steps.
+ */ + if (unlikely(!trylock_page(page))) + goto out_putpage; + + if (!PageMovable(page) || PageIsolated(page)) + goto out_no_isolated; + + mapping = page_mapping(page); + VM_BUG_ON_PAGE(!mapping, page); + + if (!mapping->a_ops->isolate_page(page, mode)) + goto out_no_isolated; + + /* Driver shouldn't use PG_isolated bit of page->flags */ + WARN_ON_ONCE(PageIsolated(page)); + __SetPageIsolated(page); + unlock_page(page); + + return 0; + +out_no_isolated: + unlock_page(page); +out_putpage: + put_page(page); +out: + return -EBUSY; +} + +/* It should be called on page which is PG_movable */ +void putback_movable_page(struct page *page) +{ + struct address_space *mapping; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + mapping = page_mapping(page); + mapping->a_ops->putback_page(page); + __ClearPageIsolated(page); +} + /* * Put previously isolated pages back onto the appropriate lists * from where they were once taken off for compaction/migration. @@ -94,10 +171,23 @@ void putback_movable_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (unlikely(isolated_balloon_page(page))) - balloon_page_putback(page); - else + /* + * We isolated non-lru movable page so here we can use + * __PageMovable because LRU page's mapping cannot have + * PAGE_MAPPING_MOVABLE. + */ + if (unlikely(__PageMovable(page))) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } else { putback_lru_page(page); + } } } @@ -580,6 +670,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); } EXPORT_SYMBOL(migrate_page_copy); @@ -588,7 +680,7 @@ EXPORT_SYMBOL(migrate_page_copy); ***********************************************************/ /* - * Common logic to directly migrate a single page suitable for + * Common logic to directly migrate a single LRU page suitable for * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. @@ -751,24 +843,47 @@ static int move_to_new_page(struct page *newpage, struct page *page, enum migrate_mode mode) { struct address_space *mapping; - int rc; + int rc = -EAGAIN; + bool is_lru = !__PageMovable(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); mapping = page_mapping(page); - if (!mapping) - rc = migrate_page(mapping, newpage, page, mode); - else if (mapping->a_ops->migratepage) + + if (likely(is_lru)) { + if (!mapping) + rc = migrate_page(mapping, newpage, page, mode); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * provide a migratepage callback. Anonymous pages + * are part of swap space which also has its own + * migratepage callback. This is the most common path + * for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + else + rc = fallback_migrate_page(mapping, newpage, + page, mode); + } else { /* - * Most pages have a mapping and most filesystems provide a - * migratepage callback. Anonymous pages are part of swap - * space which also has its own migratepage callback. This - * is the most common path for page migration. + * In case of non-lru page, it could be released after + * isolation step. 
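
A driver whose pages should be handled by isolate_movable_page() and putback_movable_page() supplies the three hooks through its address_space_operations and tags each page as movable at allocation time (upstream pairs this with __SetPageMovable(page, mapping)). A hedged sketch of the driver side, with the actual copy and bookkeeping left abstract:

	#include <linux/fs.h>
	#include <linux/migrate.h>
	#include <linux/mm.h>

	static bool my_isolate_page(struct page *page, isolate_mode_t mode)
	{
		/* Called with the page locked and a reference held:
		 * detach the page from the driver's lists, return true. */
		return true;
	}

	static int my_migratepage(struct address_space *mapping,
				  struct page *newpage, struct page *page,
				  enum migrate_mode mode)
	{
		/* Copy contents and driver metadata to newpage here. */
		return MIGRATEPAGE_SUCCESS;
	}

	static void my_putback_page(struct page *page)
	{
		/* Isolation is being undone: rehook the page. */
	}

	static const struct address_space_operations my_movable_aops = {
		.isolate_page	= my_isolate_page,
		.migratepage	= my_migratepage,
		.putback_page	= my_putback_page,
	};
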
In that case, we shouldn't try migration. */ - rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); - else - rc = fallback_migrate_page(mapping, newpage, page, mode); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + if (!PageMovable(page)) { + rc = MIGRATEPAGE_SUCCESS; + __ClearPageIsolated(page); + goto out; + } + + rc = mapping->a_ops->migratepage(mapping, newpage, + page, mode); + WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && + !PageIsolated(page)); + } /* * When successful, old pagecache page->mapping must be cleared before @@ -776,9 +891,25 @@ static int move_to_new_page(struct page *newpage, struct page *page, */ if (rc == MIGRATEPAGE_SUCCESS) { set_page_memcg(page, NULL); - if (!PageAnon(page)) + if (__PageMovable(page)) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* + * We clear PG_movable under page_lock so any compactor + * cannot try to migrate this page. + */ + __ClearPageIsolated(page); + } + + /* + * Anonymous and movable page->mapping will be cleared by + * free_pages_prepare, so don't reset it here; keeping it + * lets type checks such as PageAnon() keep working. + */ + if (!PageMappingFlags(page)) page->mapping = NULL; } +out: return rc; } @@ -788,6 +919,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(page); if (!trylock_page(page)) { if (!force || mode == MIGRATE_ASYNC) @@ -856,15 +988,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (unlikely(!trylock_page(newpage))) goto out_unlock; - if (unlikely(isolated_balloon_page(page))) { - /* - * A ballooned page does not need any special attention from - * physical to virtual reverse mapping procedures. - * Skip any attempt to unmap PTEs or to remap swap cache, - * in order to avoid burning cycles at rmap level, and perform - * the page migration right away (proteced by page lock). - */ - rc = balloon_page_migrate(newpage, page, mode); + if (unlikely(!is_lru)) { + rc = move_to_new_page(newpage, page, mode); goto out_unlock_both; } @@ -891,7 +1016,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS, NULL); page_was_mapped = 1; } @@ -910,6 +1035,19 @@ out_unlock: put_anon_vma(anon_vma); unlock_page(page); out: + /* + * If migration is successful, decrease the refcount of newpage; + * this will not free the page because the new page owner increased + * the refcount. As well, if it is an LRU page, add the page to the + * LRU list here. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + if (unlikely(__PageMovable(newpage))) + put_page(newpage); + else + putback_lru_page(newpage); + } + return rc; } @@ -943,6 +1081,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, if (page_count(page) == 1) { /* page was freed from under us. So we are done.
*/ + ClearPageActive(page); + ClearPageUnevictable(page); + if (unlikely(__PageMovable(page))) { + lock_page(page); + if (!PageMovable(page)) + __ClearPageIsolated(page); + unlock_page(page); + } + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); goto out; } @@ -951,8 +1101,9 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) - put_new_page = NULL; + if (rc == MIGRATEPAGE_SUCCESS) { + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -965,33 +1116,45 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + } + + /* + * If migration is successful, releases reference grabbed during + * isolation. Otherwise, restore the page to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + put_page(page); + if (reason == MR_MEMORY_FAILURE) { /* - * With this release, we free successfully migrated - * page and set PG_HWPoison on just freed page - * intentionally. Although it's rather weird, it's how - * HWPoison flag works at the moment. + * Set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, + * it's how HWPoison flag works at the moment. */ - put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); - } else - putback_lru_page(page); - } + } + } else { + if (rc != -EAGAIN) { + if (likely(!__PageMovable(page))) { + putback_lru_page(page); + goto put_new; + } - /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, putback_lru_page() will drop the reference grabbed - * during isolation. 
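
Since unmap_and_move() now drops the reference on newpage itself on every outcome, a caller's allocation callback stays simple; a hedged sketch of the typical migrate_pages() call, using the new_page_t signature of this kernel (the GFP mask is illustrative):

	static struct page *new_page(struct page *page, unsigned long private,
				     int **result)
	{
		return alloc_page(GFP_HIGHUSER_MOVABLE);
	}

	/* with the isolated pages collected on a private list: */
	if (!list_empty(&pagelist)) {
		err = migrate_pages(&pagelist, new_page, NULL, 0,
				    MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (err)
			putback_movable_pages(&pagelist);
	}
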
- */ - if (put_new_page) - put_new_page(newpage, private); - else if (unlikely(__is_movable_balloon_page(newpage))) { - /* drop our reference, page already in the balloon */ - put_page(newpage); - } else - putback_lru_page(newpage); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + __ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } +put_new: + if (put_new_page) + put_new_page(newpage, private); + else + put_page(newpage); + } if (result) { if (rc) @@ -1023,7 +1186,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1061,7 +1224,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_mapped(hpage)) { try_to_unmap(hpage, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS, NULL); page_was_mapped = 1; } @@ -1081,6 +1244,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1141,6 +1305,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc; + trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -1153,7 +1319,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1837,6 +2003,7 @@ fail_putback: set_page_memcg(new_page, page_memcg(page)); set_page_memcg(page, NULL); page_remove_rmap(page); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mmap.c b/mm/mmap.c index 9c9ae859fbbb..2339b533f4b2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -48,6 +48,10 @@ #include <asm/tlb.h> #include <asm/mmu_context.h> +#ifdef CONFIG_MSM_APP_SETTINGS +#include <asm/app_api.h> +#endif + #include "internal.h" #ifndef arch_mmap_check @@ -1311,6 +1315,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!len) return -EINVAL; +#ifdef CONFIG_MSM_APP_SETTINGS + if (use_app_setting) + apply_app_setting_bit(file); +#endif + /* * Does the application expect PROT_READ to imply PROT_EXEC? * diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24babd6..a81d521db56a 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -76,7 +76,7 @@ again: * down, but we are still initializing the system. Pages are given directly * to the page allocator, no bootmem metadata is updated because it is gone. */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) +void free_bootmem_late(unsigned long addr, unsigned long size) { unsigned long cursor, end; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6c1b9c43cc23..67237b7cb177 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -350,7 +350,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc, * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, * swapents, oom_score_adj value, and name. 
*/ -static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) +void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3309dbda7ffa..a98dae1bdcff 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) unsigned long nr_pages; nr_pages = zone_page_state(zone, NR_FREE_PAGES); - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, zone->totalreserve_pages); nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); @@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void) unsigned long x; x = global_page_state(NR_FREE_PAGES); - x -= min(x, dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); x += global_page_state(NR_INACTIVE_FILE); x += global_page_state(NR_ACTIVE_FILE); @@ -1944,6 +1954,12 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; + /* Try safe version */ + else if (unlikely(global_page_state_snapshot(NR_UNSTABLE_NFS) + + global_page_state_snapshot(NR_WRITEBACK) <= + dirty_thresh)) + break; + congestion_wait(BLK_RW_ASYNC, HZ/10); /* @@ -1978,11 +1994,11 @@ void laptop_mode_timer_fn(unsigned long data) * We want to write everything out, not just down to the dirty * threshold */ - if (!bdi_has_dirty_io(&q->backing_dev_info)) + if (!bdi_has_dirty_io(q->backing_dev_info)) return; rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) + list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) wb_start_writeback(wb, nr_pages, true, WB_REASON_LAPTOP_TIMER); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cebaf3cc75d4..bad5f32a9765 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -/* - * When calculating the number of globally allowed dirty pages, there - * is a certain number of per-zone reserves that should not be - * considered dirtyable memory. This is the sum of those reserves - * over all existing zones that contribute dirtyable memory. 
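
Substituting totalreserve_pages for the old per-zone dirty_balance_reserve means the dirtyable estimate is, in effect, free pages minus everything the kernel keeps in reserve, plus the file LRU. An illustrative condensation of the patched global_dirtyable_memory() (highmem handling omitted):

	static unsigned long dirtyable_estimate(void)
	{
		unsigned long x = global_page_state(NR_FREE_PAGES);

		/* kernel reserves are never dirtyable */
		x -= min(x, totalreserve_pages);

		x += global_page_state(NR_INACTIVE_FILE);
		x += global_page_state(NR_ACTIVE_FILE);

		return x + 1;	/* make sure it is never zero */
	}
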
- */ -unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -230,6 +223,20 @@ static char * const zone_names[MAX_NR_ZONES] = { }; static void free_compound_page(struct page *page); + +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", +#ifdef CONFIG_CMA + "CMA", +#endif + "HighAtomic", +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, @@ -475,6 +482,7 @@ static void bad_page(struct page *page, const char *reason, printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); dump_page_badflags(page, reason, bad_flags); + dump_page_owner(page); print_modules(); dump_stack(); @@ -521,7 +529,8 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -532,6 +541,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -1028,9 +1040,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); - if (PageAnon(page)) + if (PageMappingFlags(page)) page->mapping = NULL; bad += free_pages_check(page); for (i = 1; i < (1 << order); i++) { @@ -1050,7 +1061,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); + kasan_free_pages(page, order); return true; } @@ -1071,8 +1084,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1144,7 +1156,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, #endif -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order) { if (early_page_uninitialised(pfn)) @@ -1322,6 +1334,11 @@ void __init page_alloc_init_late(void) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA +bool is_cma_pageblock(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_CMA; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
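
Because the default now comes from CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT and the parser accepts an explicit "off", the feature can be flipped either way at boot regardless of the build-time choice, for example:

	debug_pagealloc=on	(enable on a kernel built with the default off)
	debug_pagealloc=off	(disable on a kernel built with the default on)
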
*/ void __init init_cma_reserved_pageblock(struct page *page) { @@ -1429,8 +1446,27 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(void) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled(); +} + +inline void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags) +{ + set_page_private(page, 0); + set_page_refcounted(page); + + kasan_alloc_pages(page, order); + arch_alloc_page(page, order); + kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); + set_page_owner(page, order, gfp_flags); +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) + int alloc_flags) { int i; @@ -1440,22 +1476,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, return 1; } - set_page_private(page, 0); - set_page_refcounted(page); - - arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); - kasan_alloc_pages(page, order); + post_alloc_hook(page, order, gfp_flags); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); - set_page_owner(page, order, gfp_flags); - /* * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking @@ -1518,6 +1547,11 @@ static int fallbacks[MIGRATE_TYPES][4] = { #endif }; +int *get_migratetype_fallbacks(int mtype) +{ + return fallbacks[mtype]; +} + #ifdef CONFIG_CMA static struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) @@ -1835,7 +1869,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_entry(area->free_list[fallback_mt].next, struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -1874,17 +1909,30 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { - if (migratetype == MIGRATE_MOVABLE) - page = __rmqueue_cma_fallback(zone, order); - - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + page = __rmqueue_fallback(zone, order, migratetype); } trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } +#ifdef CONFIG_CMA +static struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +{ + struct page *page = NULL; + if (IS_ENABLED(CONFIG_CMA)) + if (!zone->cma_alloc) + page = __rmqueue_cma_fallback(zone, order); + trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); + return page; +} +#else +static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +{ + return NULL; +} +#endif + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -1898,7 +1946,17 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype, 0); + struct page *page; + + /* + * If migrate type CMA is being requested, only try to + * satisfy the request with CMA pages, to try and increase + * CMA utilization.
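
Note that __rmqueue_cma() is reached only for movable allocations that opt in via the __GFP_CMA modifier checked later in buffered_rmqueue(), and is gated off while zone->cma_alloc is set by alloc_contig_range(). A hedged sketch of a caller (the flag is specific to this patch series, not mainline):

	/* Prefer CMA pageblocks when they are free and no contiguous
	 * allocation is currently running against this zone. */
	struct page *page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_CMA);

	if (!page)
		return -ENOMEM;
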
+ */ + if (is_migrate_cma(migratetype)) + page = __rmqueue_cma(zone, order); + else + page = __rmqueue(zone, order, migratetype, 0); if (unlikely(page == NULL)) break; @@ -1925,6 +1983,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return i; } +/* + * Return the pcp list that corresponds to the migrate type if that list isn't + * empty. + * If the list is empty return NULL. + */ +static struct list_head *get_populated_pcp_list(struct zone *zone, + unsigned int order, struct per_cpu_pages *pcp, + int migratetype, int cold) +{ + struct list_head *list = &pcp->lists[migratetype]; + + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, order, + pcp->batch, list, + migratetype, cold); + + if (list_empty(list)) + list = NULL; + } + return list; +} + #ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this @@ -2172,7 +2252,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; - gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -2186,12 +2265,9 @@ void split_page(struct page *page, unsigned int order) split_page(virt_to_page(page[0].shadow), order); #endif - gfp_mask = get_page_owner_gfp(page); - set_page_owner(page, 0, gfp_mask); - for (i = 1; i < (1 << order); i++) { + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - set_page_owner(page + i, 0, gfp_mask); - } + split_page_owner(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -2209,7 +2285,8 @@ int __isolate_free_page(struct page *page, unsigned int order) if (!is_migrate_isolate(mt)) { /* Obey watermarks as if the page was being allocated */ watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + if (!is_migrate_cma(mt) && + !zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -2220,14 +2297,13 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); - set_page_owner(page, order, __GFP_MOVABLE); - /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -2238,33 +2314,6 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* - * Similar to split_page except the page is already free. As this is only - * being used for migration, the migratetype of the block also changes. - * As this is called with interrupts disabled, the caller is responsible - * for calling arch_alloc_page() and kernel_map_page() after interrupts - * are enabled. - * - * Note: this is probably too low level an operation for use in drivers. - * Please consult with lkml before using this in your driver. - */ -int split_free_page(struct page *page) -{ - unsigned int order; - int nr_pages; - - order = page_order(page); - - nr_pages = __isolate_free_page(page, order); - if (!nr_pages) - return 0; - - /* Split into individual pages */ - set_page_refcounted(page); - split_page(page, order); - return nr_pages; -} - -/* * Allocate a page from the given zone. Use pcplists for order-0 allocations. 
*/ static inline @@ -2273,21 +2322,32 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; - struct page *page; + struct page *page = NULL; bool cold = ((gfp_flags & __GFP_COLD) != 0); if (likely(order == 0)) { struct per_cpu_pages *pcp; - struct list_head *list; + struct list_head *list = NULL; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) + + /* First try to get CMA pages */ + if (migratetype == MIGRATE_MOVABLE && + gfp_flags & __GFP_CMA) { + list = get_populated_pcp_list(zone, 0, pcp, + get_cma_migrate_type(), cold); + } + + if (list == NULL) { + /* + * Either CMA is not suitable or there are no free CMA + * pages. + */ + list = get_populated_pcp_list(zone, 0, pcp, + migratetype, cold); + if (unlikely(list == NULL) || + unlikely(list_empty(list))) goto failed; } @@ -2320,8 +2380,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); } + if (!page && migratetype == MIGRATE_MOVABLE && + gfp_flags & __GFP_CMA) + page = __rmqueue_cma(zone, order); + if (!page) page = __rmqueue(zone, order, migratetype, gfp_flags); + spin_unlock(&zone->lock); if (!page) goto failed; @@ -2481,6 +2546,14 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { +#ifdef CONFIG_CMA + /* + * Note that this check is needed only + * when MIGRATE_CMA < MIGRATE_PCPTYPES. + */ + if (mt == MIGRATE_CMA) + continue; +#endif if (!list_empty(&area->free_list[mt])) return true; } @@ -5266,6 +5339,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6036,20 +6112,12 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; + + zone->totalreserve_pages = max; + reserve_pages += max; - /* - * Lowmem reserves are not available to - * GFP_HIGHUSER page cache allocations and - * kswapd tries to balance zones to their high - * watermark. As a result, neither should be - * regarded as dirtyable memory, to prevent a - * situation where reclaim has to clean pages - * in order to balance the zones. - */ - zone->dirty_balance_reserve = max; } } - dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } @@ -6584,8 +6652,9 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. 
*/ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -6641,6 +6710,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -6810,6 +6882,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + cc.zone->cma_alloc = 1; + ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) goto done; @@ -6868,6 +6942,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, done: undo_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype); + cc.zone->cma_alloc = 0; return ret; } diff --git a/mm/page_ext.c b/mm/page_ext.c index 4d1eac0d4fc5..f02ad1cc7d24 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -54,9 +54,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -111,6 +108,9 @@ struct page_ext *lookup_page_ext(struct page *page) * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -183,6 +183,9 @@ struct page_ext *lookup_page_ext(struct page *page) * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 00c96462cc36..efb6c3c38c01 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -7,6 +7,8 @@ #include <linux/pageblock-flags.h> #include <linux/memory.h> #include <linux/hugetlb.h> +#include <linux/kasan.h> +#include <linux/page_owner.h> #include "internal.h" static int set_migratetype_isolate(struct page *page, @@ -105,8 +107,6 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (pfn_valid_within(page_to_pfn(buddy)) && !is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); - kernel_map_pages(page, (1 << order), 1); - set_page_refcounted(page); isolated_page = page; } } @@ -125,8 +125,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); - if (isolated_page) + if (isolated_page) { + post_alloc_hook(page, order, __GFP_MOVABLE); __free_pages(isolated_page, order); + } } static inline struct page * diff --git a/mm/page_owner.c b/mm/page_owner.c index dd6b9cebf981..10b7f196b005 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,24 @@ #include <linux/bootmem.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> +#include <linux/jump_label.h> +#include <linux/migrate.h> +#include <linux/stackdepot.h> + #include "internal.h" -static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +/* + * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) + * to use off stack temporal storage + */ +#define PAGE_OWNER_STACK_DEPTH (16) + +static bool page_owner_disabled = + !IS_ENABLED(CONFIG_PAGE_OWNER_ENABLE_DEFAULT); 
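
The runtime cost of the disabled case becomes a patched-out static branch rather than a global load, and the tracker still follows the usual page_owner workflow; assuming a kernel built with CONFIG_PAGE_OWNER, typical usage is:

	page_owner=on						(kernel command line)
	cat /sys/kernel/debug/page_owner > page_owner_full.txt	(dump live data)
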
+DEFINE_STATIC_KEY_FALSE(page_owner_inited); + +static depot_stack_handle_t dummy_handle; +static depot_stack_handle_t failure_handle; static void init_early_allocated_pages(void); @@ -20,6 +34,9 @@ static int early_page_owner_param(char *buf) if (strcmp(buf, "on") == 0) page_owner_disabled = false; + if (strcmp(buf, "off") == 0) + page_owner_disabled = true; + return 0; } early_param("page_owner", early_page_owner_param); @@ -32,12 +49,42 @@ static bool need_page_owner(void) return true; } +static noinline void register_dummy_stack(void) +{ + unsigned long entries[4]; + struct stack_trace dummy; + + dummy.nr_entries = 0; + dummy.max_entries = ARRAY_SIZE(entries); + dummy.entries = &entries[0]; + dummy.skip = 0; + + save_stack_trace(&dummy); + dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); +} + +static noinline void register_failure_stack(void) +{ + unsigned long entries[4]; + struct stack_trace failure; + + failure.nr_entries = 0; + failure.max_entries = ARRAY_SIZE(entries); + failure.entries = &entries[0]; + failure.skip = 0; + + save_stack_trace(&failure); + failure_handle = depot_save_stack(&failure, GFP_KERNEL); +} + static void init_page_owner(void) { if (page_owner_disabled) return; - page_owner_inited = true; + register_dummy_stack(); + register_failure_stack(); + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -59,52 +106,135 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) +static inline bool check_recursive_alloc(struct stack_trace *trace, + unsigned long ip) { - struct page_ext *page_ext = lookup_page_ext(page); + int i, count; + + if (!trace->nr_entries) + return false; + + for (i = 0, count = 0; i < trace->nr_entries; i++) { + if (trace->entries[i] == ip && ++count == 2) + return true; + } + + return false; +} +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { .nr_entries = 0, - .max_entries = ARRAY_SIZE(page_ext->trace_entries), - .entries = &page_ext->trace_entries[0], - .skip = 3, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 2 }; + depot_stack_handle_t handle; + + save_stack_trace(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + /* + * We need to check recursion here because our request to stackdepot + * could trigger memory allocation to save new entry. New memory + * allocation would reach here and call depot_save_stack() again + * if we don't catch it. There is still not enough memory in stackdepot + * so it would try to allocate memory again and loop forever. 
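
The switch from inline trace_entries arrays to stackdepot handles stores each unique allocation stack once and refers to it by a small handle. A minimal sketch of the save/fetch pairing as used in this file:

	#include <linux/stackdepot.h>
	#include <linux/stacktrace.h>

	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.skip		= 2,
	};
	depot_stack_handle_t handle;

	save_stack_trace(&trace);
	handle = depot_save_stack(&trace, GFP_KERNEL);	/* dedup + store */

	/* ... much later, when dumping ... */
	depot_fetch_stack(handle, &trace);
	print_stack_trace(&trace, 0);
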
+ */ + if (check_recursive_alloc(&trace, _RET_IP_)) + return dummy_handle; + + handle = depot_save_stack(&trace, flags); + if (!handle) + handle = failure_handle; + + return handle; +} + +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) return; - save_stack_trace(&trace); - + page_ext->handle = save_stack(gfp_mask); page_ext->order = order; page_ext->gfp_mask = gfp_mask; - page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } -gfp_t __get_page_owner_gfp(struct page *page) +void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); if (unlikely(!page_ext)) + return; + + page_ext->last_migrate_reason = reason; +} + +void __split_page_owner(struct page *page, unsigned int order) +{ + int i; + struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) /* - * The caller just returns 0 if no valid gfp - * So return 0 here too. + * The caller just returns if there is no valid gfp, + * so return here too. */ - return 0; + return; + + page_ext->order = 0; + for (i = 1; i < (1 << order); i++) + __copy_page_owner(page, page + i); +} + +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + + if (unlikely(!old_ext || !new_ext)) + return; - return page_ext->gfp_mask; + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->last_migrate_reason = old_ext->last_migrate_reason; + new_ext->handle = old_ext->handle; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overall stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed.
+ */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); } static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, - struct page *page, struct page_ext *page_ext) + struct page *page, struct page_ext *page_ext, + depot_stack_handle_t handle) { int ret; int pageblock_mt, page_mt; char *kbuf; + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; struct stack_trace trace = { - .nr_entries = page_ext->nr_entries, - .entries = &page_ext->trace_entries[0], + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 }; kbuf = kmalloc(count, GFP_KERNEL); @@ -112,8 +242,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -122,31 +253,29 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? "L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? 
"M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; + depot_fetch_stack(handle, &trace); ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -162,14 +291,58 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = PAGE_OWNER_STACK_DEPTH, + .skip = 0 + }; + depot_stack_handle_t handle; + gfp_t gfp_mask; + int mt; + + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); + return; + } + gfp_mask = page_ext->gfp_mask; + mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + handle = READ_ONCE(page_ext->handle); + if (!handle) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + depot_fetch_stack(handle, &trace); + pr_alert("page allocated via order %u, migratetype %s, " + "gfp_mask %#x(%pGg)\n", page_ext->order, + migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long pfn; struct page *page; struct page_ext *page_ext; + depot_stack_handle_t handle; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -216,10 +389,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; + /* + * Access to page_ext->handle isn't synchronous so we should + * be careful to access it. + */ + handle = READ_ONCE(page_ext->handle); + if (!handle) + continue; + /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; - return print_page_owner(buf, count, pfn, page, page_ext); + return print_page_owner(buf, count, pfn, page, + page_ext, handle); } return 0; @@ -258,6 +440,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + /* * We are safe to check buddy flag and order, because * this is init stage and only single thread runs. 
@@ -321,7 +506,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/page_poison.c b/mm/page_poison.c new file mode 100644 index 000000000000..c8cf230dbfcb --- /dev/null +++ b/mm/page_poison.c @@ -0,0 +1,134 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/page_ext.h> +#include <linux/poison.h> +#include <linux/ratelimit.h> + +static bool want_page_poisoning __read_mostly + = IS_ENABLED(CONFIG_PAGE_POISONING_ENABLE_DEFAULT); + +static int early_page_poison_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; + + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + /* + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options is enabled, enable poisoning. + */ + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); +} + +static void poison_page(struct page *page) +{ + void *addr = kmap_atomic(page); + + memset(addr, PAGE_POISON, PAGE_SIZE); + kunmap_atomic(addr); +} + +static void poison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + poison_page(page + i); +} + +static bool single_bit_flip(unsigned char a, unsigned char b) +{ + unsigned char error = a ^ b; + + return error && !(error & (error - 1)); +} + +static void check_poison_mem(struct page *page, + unsigned char *mem, size_t bytes) +{ + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); + unsigned char *start; + unsigned char *end; + + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + + start = memchr_inv(mem, PAGE_POISON, bytes); + if (!start) + return; + + for (end = mem + bytes - 1; end > start; end--) { + if (*end != PAGE_POISON) + break; + } + + if (!__ratelimit(&ratelimit)) + return; + else if (start == end && single_bit_flip(*start, PAGE_POISON)) + pr_err("pagealloc: single bit error on page with phys start 0x%lx\n", + (unsigned long)page_to_phys(page)); + else + pr_err("pagealloc: memory corruption on page with phys start 0x%lx\n", + (unsigned long)page_to_phys(page)); + + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, + end - start + 1, 1); + BUG_ON(PANIC_CORRUPTION); + dump_stack(); +} + +static void unpoison_page(struct page *page) +{ + void *addr; + + addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was poisoned.
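
single_bit_flip() relies on the usual power-of-two test: x & (x - 1) clears the lowest set bit, so the XOR of the two bytes has exactly one bit set iff the whole expression is true. A worked example with the 0xaa poison pattern:

	/* Suppose a poisoned byte reads back as 0xab. */
	unsigned char error = 0xaa ^ 0xab;	/* 0x01: one differing bit */

	/* error && !(error & (error - 1))
	 *   -> 0x01 && !(0x01 & 0x00) -> true: reported as a single
	 * bit error. A readback of 0xa5 gives error = 0x0f, which fails
	 * the test and is reported as general memory corruption. */
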
+ */ + check_poison_mem(page, addr, PAGE_SIZE); + kunmap_atomic(addr); +} + +static void unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +void kernel_poison_pages(struct page *page, int numpages, int enable) +{ + if (!page_poisoning_enabled()) + return; + + if (enable) + unpoison_pages(page, numpages); + else + poison_pages(page, numpages); +} + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif diff --git a/mm/process_reclaim.c b/mm/process_reclaim.c new file mode 100644 index 000000000000..98e5af190fe0 --- /dev/null +++ b/mm/process_reclaim.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/sort.h> +#include <linux/oom.h> +#include <linux/sched.h> +#include <linux/rcupdate.h> +#include <linux/notifier.h> +#include <linux/vmpressure.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/process_reclaim.h> + +#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX + +static void swap_fn(struct work_struct *work); +DECLARE_WORK(swap_work, swap_fn); + +/* User knob to enable/disable process reclaim feature */ +static int enable_process_reclaim; +module_param_named(enable_process_reclaim, enable_process_reclaim, int, + S_IRUGO | S_IWUSR); + +/* The max number of pages tried to be reclaimed in a single run */ +int per_swap_size = SWAP_CLUSTER_MAX * 32; +module_param_named(per_swap_size, per_swap_size, int, S_IRUGO | S_IWUSR); + +int reclaim_avg_efficiency; +module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency, + int, S_IRUGO); + +/* The vmpressure region where process reclaim operates */ +static unsigned long pressure_min = 50; +static unsigned long pressure_max = 90; +module_param_named(pressure_min, pressure_min, ulong, S_IRUGO | S_IWUSR); +module_param_named(pressure_max, pressure_max, ulong, S_IRUGO | S_IWUSR); + +static short min_score_adj = 360; +module_param_named(min_score_adj, min_score_adj, short, + S_IRUGO | S_IWUSR); + +/* + * Scheduling the process reclaim workqueue unnecessarily + * when the reclaim efficiency is low does not make + * sense. We try to detect a drop in efficiency and + * disable reclaim for a time period. This period and the + * period for which we monitor a drop in efficiency are + * defined by swap_eff_win. swap_opt_eff is the optimal + * efficiency used as the threshold for this.
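
Later in swap_fn() each selected task is asked to give up pages in proportion to its anonymous footprint. With the default per_swap_size of SWAP_CLUSTER_MAX * 32 (1024 pages when SWAP_CLUSTER_MAX is 32), the split works out as:

	/* nr_to_reclaim = tasksize * per_swap_size / total_sz
	 *
	 * e.g. three selected tasks with 600, 300 and 100 anon pages
	 * (total_sz = 1000) and per_swap_size = 1024:
	 *   task A: 600 * 1024 / 1000 = 614 pages
	 *   task B: 300 * 1024 / 1000 = 307 pages
	 *   task C: 100 * 1024 / 1000 = 102 pages
	 * (a result of 0 is rounded up to 1 so every task is scanned)
	 */
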
+ */ +static int swap_eff_win = 2; +module_param_named(swap_eff_win, swap_eff_win, int, S_IRUGO | S_IWUSR); + +static int swap_opt_eff = 50; +module_param_named(swap_opt_eff, swap_opt_eff, int, S_IRUGO | S_IWUSR); + +static atomic_t skip_reclaim = ATOMIC_INIT(0); +/* Not atomic since only a single instance of swap_fn runs at a time */ +static int monitor_eff; + +struct selected_task { + struct task_struct *p; + int tasksize; + short oom_score_adj; +}; + +int selected_cmp(const void *a, const void *b) +{ + const struct selected_task *x = a; + const struct selected_task *y = b; + int ret; + + ret = x->tasksize < y->tasksize ? -1 : 1; + + return ret; +} + +static int test_task_flag(struct task_struct *p, int flag) +{ + struct task_struct *t = p; + + rcu_read_lock(); + for_each_thread(p, t) { + task_lock(t); + if (test_tsk_thread_flag(t, flag)) { + task_unlock(t); + rcu_read_unlock(); + return 1; + } + task_unlock(t); + } + rcu_read_unlock(); + + return 0; +} + +static void swap_fn(struct work_struct *work) +{ + struct task_struct *tsk; + struct reclaim_param rp; + + /* Pick the best MAX_SWAP_TASKS tasks in terms of anon size */ + struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},}; + int si = 0; + int i; + int tasksize; + int total_sz = 0; + int total_scan = 0; + int total_reclaimed = 0; + int nr_to_reclaim; + int efficiency; + + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *p; + short oom_score_adj; + + if (tsk->flags & PF_KTHREAD) + continue; + + if (test_task_flag(tsk, TIF_MEMDIE)) + continue; + + p = find_lock_task_mm(tsk); + if (!p) + continue; + + oom_score_adj = p->signal->oom_score_adj; + if (oom_score_adj < min_score_adj) { + task_unlock(p); + continue; + } + + tasksize = get_mm_counter(p->mm, MM_ANONPAGES); + task_unlock(p); + + if (tasksize <= 0) + continue; + + if (si == MAX_SWAP_TASKS) { + sort(&selected[0], MAX_SWAP_TASKS, + sizeof(struct selected_task), + &selected_cmp, NULL); + if (tasksize < selected[0].tasksize) + continue; + selected[0].p = p; + selected[0].oom_score_adj = oom_score_adj; + selected[0].tasksize = tasksize; + } else { + selected[si].p = p; + selected[si].oom_score_adj = oom_score_adj; + selected[si].tasksize = tasksize; + si++; + } + } + + for (i = 0; i < si; i++) + total_sz += selected[i].tasksize; + + /* Skip reclaim if the total size is too small */ + if (total_sz < SWAP_CLUSTER_MAX) { + rcu_read_unlock(); + return; + } + + for (i = 0; i < si; i++) + get_task_struct(selected[i].p); + + rcu_read_unlock(); + + while (si--) { + nr_to_reclaim = + (selected[si].tasksize * per_swap_size) / total_sz; + /* scan at least a page */ + if (!nr_to_reclaim) + nr_to_reclaim = 1; + + rp = reclaim_task_anon(selected[si].p, nr_to_reclaim); + + trace_process_reclaim(selected[si].tasksize, + selected[si].oom_score_adj, rp.nr_scanned, + rp.nr_reclaimed, per_swap_size, total_sz, + nr_to_reclaim); + total_scan += rp.nr_scanned; + total_reclaimed += rp.nr_reclaimed; + put_task_struct(selected[si].p); + } + + if (total_scan) { + efficiency = (total_reclaimed * 100) / total_scan; + + if (efficiency < swap_opt_eff) { + if (++monitor_eff == swap_eff_win) { + atomic_set(&skip_reclaim, swap_eff_win); + monitor_eff = 0; + } + } else { + monitor_eff = 0; + } + + reclaim_avg_efficiency = + (efficiency + reclaim_avg_efficiency) / 2; + trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency); + } +} + +static int vmpressure_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long pressure = action; + + if (!enable_process_reclaim)
+ return 0; + + if (!current_is_kswapd()) + return 0; + + if (atomic_dec_if_positive(&skip_reclaim) >= 0) + return 0; + + if ((pressure >= pressure_min) && (pressure < pressure_max)) + if (!work_pending(&swap_work)) + queue_work(system_unbound_wq, &swap_work); + return 0; +} + +static struct notifier_block vmpr_nb = { + .notifier_call = vmpressure_notifier, +}; + +static int __init process_reclaim_init(void) +{ + vmpressure_notifier_register(&vmpr_nb); + return 0; +} + +static void __exit process_reclaim_exit(void) +{ + vmpressure_notifier_unregister(&vmpr_nb); +} + +module_init(process_reclaim_init); +module_exit(process_reclaim_exit); diff --git a/mm/readahead.c b/mm/readahead.c index ba22d7fe0afb..72c17e77a6c7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -234,6 +234,8 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, /* * Set the initial window size, round to next power of 2 and square + * Small size is not dependent on max value - only a one-page read is regarded + * as small. * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-8 page = 32k initial, > 8 page = 128k initial @@ -242,7 +244,7 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); - if (newsize <= max / 32) + if (newsize <= 1) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; diff --git a/mm/rmap.c b/mm/rmap.c index 1bceb49aa214..effcea83ac4e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1481,9 +1481,12 @@ static int page_not_mapped(struct page *page) * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped * @flags: action and flags + * @vma: target vma for reclaim * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. + * If @vma is not NULL, this function tries to remove @page from @vma only, + * without scanning all the VMAs that map @page. * Return values are: * * SWAP_SUCCESS - we succeeded in removing all mappings @@ -1491,7 +1494,8 @@ static int page_not_mapped(struct page *page) * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked.
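
For the common 128k (32-page) maximum window, the practical effect of the get_init_ra_size() change is that only single-page reads now get the 4x "small" treatment; assuming the unchanged final branch still clamps to max:

	/* requested pages -> initial window (max = 32 pages)
	 *   1         -> 4       (newsize <= 1, x 4)
	 *   2 .. 8    -> 4 .. 16 (newsize <= max / 4, x 2)
	 *   9 and up  -> 32      (clamped to max)
	 */
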
*/ -int try_to_unmap(struct page *page, enum ttu_flags flags) +int try_to_unmap(struct page *page, enum ttu_flags flags, + struct vm_area_struct *vma) { int ret; struct rmap_walk_control rwc = { @@ -1499,6 +1503,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .arg = (void *)flags, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, + .target_vma = vma, }; VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); @@ -1544,6 +1549,7 @@ int try_to_munlock(struct page *page) .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, + .target_vma = NULL, }; @@ -1605,6 +1611,11 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; + if (rwc->target_vma) { + unsigned long address = vma_address(page, rwc->target_vma); + return rwc->rmap_one(page, rwc->target_vma, address, rwc->arg); + } + anon_vma = rmap_walk_anon_lock(page, rwc); if (!anon_vma) return ret; @@ -1647,6 +1658,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) struct address_space *mapping = page->mapping; pgoff_t pgoff; struct vm_area_struct *vma; + unsigned long address; int ret = SWAP_AGAIN; /* @@ -1662,6 +1674,12 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) pgoff = page_to_pgoff(page); i_mmap_lock_read(mapping); + if (rwc->target_vma) { + address = vma_address(page, rwc->target_vma); + ret = rwc->rmap_one(page, rwc->target_vma, address, rwc->arg); + goto done; + } + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); diff --git a/mm/showmem.c b/mm/showmem.c new file mode 100644 index 000000000000..1103a02b2cbd --- /dev/null +++ b/mm/showmem.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/kernel.h> +#include <linux/notifier.h> +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/init.h> + +ATOMIC_NOTIFIER_HEAD(show_mem_notifier); + +int show_mem_notifier_register(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&show_mem_notifier, nb); +} + +int show_mem_notifier_unregister(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&show_mem_notifier, nb); +} + +void show_mem_call_notifiers(void) +{ + atomic_notifier_call_chain(&show_mem_notifier, 0, NULL); +} + +static int show_mem_notifier_get(void *dat, u64 *val) +{ + show_mem_call_notifiers(); + *val = 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(show_mem_notifier_debug_ops, show_mem_notifier_get, + NULL, "%llu\n"); + +int show_mem_notifier_debugfs_register(void) +{ + debugfs_create_file("show_mem_notifier", 0664, NULL, NULL, + &show_mem_notifier_debug_ops); + + return 0; +} +late_initcall(show_mem_notifier_debugfs_register); diff --git a/mm/slub.c b/mm/slub.c index 06bed117ce90..fd3a044aaa4a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -685,11 +685,21 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) dump_stack(); } +#ifdef CONFIG_SLUB_DEBUG_PANIC_ON +static void slab_panic(const char *cause) +{ + panic("%s\n", cause); +} +#else +static inline void slab_panic(const char *cause) {} +#endif + void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { slab_bug(s, "%s", reason); print_trailer(s, page, object); + slab_panic(reason); } static void slab_err(struct kmem_cache *s, struct page *page, @@ -704,6 +714,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, slab_bug(s, "%s", buf); print_page_info(page); dump_stack(); + slab_panic("slab error"); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -725,6 +736,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { + slab_panic("object poison overwritten"); slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); memset(from, data, to - from); } @@ -1577,6 +1589,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; + kasan_alloc_pages(page, order); __free_kmem_pages(page, order); } @@ -3743,6 +3756,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); + kasan_alloc_pages(page, compound_order(page)); __free_kmem_pages(page, compound_order(page)); return; } diff --git a/mm/swap_ratio.c b/mm/swap_ratio.c new file mode 100644 index 000000000000..cf2a6e2ae135 --- /dev/null +++ b/mm/swap_ratio.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
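
A client of the new chain just registers a callback that logs its own usage whenever show_mem information is dumped; reading the debugfs file created below fires the chain on demand. A hypothetical driver-side sketch (my_pinned_pages is illustrative):

	#include <linux/notifier.h>
	#include <linux/printk.h>

	static unsigned long my_pinned_pages;	/* hypothetical counter */

	static int my_show_mem(struct notifier_block *nb,
			       unsigned long action, void *data)
	{
		pr_info("mydrv: %lu pages pinned\n", my_pinned_pages);
		return 0;
	}

	static struct notifier_block my_show_mem_nb = {
		.notifier_call = my_show_mem,
	};

	/* at init time: show_mem_notifier_register(&my_show_mem_nb); */
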
+ */ + +#include <linux/mm_types.h> +#include <linux/swapfile.h> +#include <linux/swap.h> + +#define SWAP_RATIO_GROUP_START (SWAP_FLAG_PRIO_MASK - 9) /* 32758 */ +#define SWAP_RATIO_GROUP_END (SWAP_FLAG_PRIO_MASK) /* 32767 */ +#define SWAP_FAST_WRITES (SWAPFILE_CLUSTER * (SWAP_CLUSTER_MAX / 8)) +#define SWAP_SLOW_WRITES SWAPFILE_CLUSTER + +/* + * The fast/slow swap write ratio. + * 100 indicates that all writes should + * go to fast swap device. + */ +int sysctl_swap_ratio = 100; + +/* Enable the swap ratio feature */ +int sysctl_swap_ratio_enable; + +static bool is_same_group(struct swap_info_struct *a, + struct swap_info_struct *b) +{ + if (!sysctl_swap_ratio_enable) + return false; + + if (!is_swap_ratio_group(a->prio)) + return false; + + if (a->prio == b->prio) + return true; + + return false; +} + +/* Caller must hold swap_avail_lock */ +static int calculate_write_pending(struct swap_info_struct *si, + struct swap_info_struct *n) +{ + int ratio = sysctl_swap_ratio; + + if ((ratio < 0) || (ratio > 100)) + return -EINVAL; + + if (WARN_ON(!(si->flags & SWP_FAST))) + return -ENODEV; + + if ((n->flags & SWP_FAST) || !is_same_group(si, n)) + return -ENODEV; + + si->max_writes = ratio ? SWAP_FAST_WRITES : 0; + n->max_writes = ratio ? (SWAP_FAST_WRITES * 100) / + ratio - SWAP_FAST_WRITES : SWAP_SLOW_WRITES; + + si->write_pending = si->max_writes; + n->write_pending = n->max_writes; + + trace_printk("%u, %u\n", si->max_writes, n->max_writes); + + return 0; +} + +static int swap_ratio_slow(struct swap_info_struct **si) +{ + struct swap_info_struct *n = NULL; + int ret = 0; + + spin_lock(&(*si)->lock); + spin_lock(&swap_avail_lock); + if (&(*si)->avail_list == plist_last(&swap_avail_head)) { + /* just to make skip work */ + n = *si; + ret = -ENODEV; + goto skip; + } + n = plist_next_entry(&(*si)->avail_list, + struct swap_info_struct, + avail_list); + if (n == *si) { + /* No other swap device */ + ret = -ENODEV; + goto skip; + } + + spin_unlock(&swap_avail_lock); + spin_lock(&n->lock); + spin_lock(&swap_avail_lock); + + if ((*si)->flags & SWP_FAST) { + if ((*si)->write_pending) { + (*si)->write_pending--; + goto exit; + } else { + if ((n->flags & SWP_FAST) || !is_same_group(*si, n)) { + /* Should never happen */ + ret = -ENODEV; + } else if (n->write_pending) { + /* + * Requeue fast device, since there are pending + * writes for slow device. + */ + plist_requeue(&(*si)->avail_list, + &swap_avail_head); + n->write_pending--; + spin_unlock(&(*si)->lock); + *si = n; + goto skip; + } else { + if (0 > calculate_write_pending(*si, n)) { + ret = -ENODEV; + goto exit; + } + /* Restart from fast device */ + (*si)->write_pending--; + } + } + } else { + if (!(n->flags & SWP_FAST) || !is_same_group(*si, n)) { + /* Should never happen */ + ret = -ENODEV; + } else if (n->write_pending) { + /* + * Pending writes for fast device. + * We reach here when slow device is swapped on first, + * before fast device. 
+ */ + /* requeue slow device to the end */ + plist_requeue(&(*si)->avail_list, &swap_avail_head); + n->write_pending--; + spin_unlock(&(*si)->lock); + *si = n; + goto skip; + } else { + if ((*si)->write_pending) { + (*si)->write_pending--; + } else { + if (0 > calculate_write_pending(n, *si)) { + ret = -ENODEV; + goto exit; + } + n->write_pending--; + plist_requeue(&(*si)->avail_list, + &swap_avail_head); + spin_unlock(&(*si)->lock); + *si = n; + goto skip; + } + } + } +exit: + spin_unlock(&(*si)->lock); +skip: + spin_unlock(&swap_avail_lock); + /* *si and n may have been interchanged above */ + spin_unlock(&n->lock); + return ret; +} + +bool is_swap_ratio_group(int prio) +{ + return ((prio >= SWAP_RATIO_GROUP_START) && + (prio <= SWAP_RATIO_GROUP_END)) ? true : false; +} + +void setup_swap_ratio(struct swap_info_struct *p, int prio) +{ + /* Used only if sysctl_swap_ratio_enable is set */ + if (is_swap_ratio_group(prio)) { + if (p->flags & SWP_FAST) + p->write_pending = SWAP_FAST_WRITES; + else + p->write_pending = SWAP_SLOW_WRITES; + p->max_writes = p->write_pending; + } +} + +int swap_ratio(struct swap_info_struct **si) +{ + if (!sysctl_swap_ratio_enable) + return -ENODEV; + + if (is_swap_ratio_group((*si)->prio)) + return swap_ratio_slow(si); + else + return -ENODEV; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index d504adb7fa5f..61039e39e25f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -96,6 +96,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) if (likely(!error)) { address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SWAPCACHE); INC_CACHE_INFO(add_total); } spin_unlock_irq(&address_space->tree_lock); @@ -148,6 +149,7 @@ void __delete_from_swap_cache(struct page *page) ClearPageSwapCache(page); address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_zone_page_state(page, NR_SWAPCACHE); INC_CACHE_INFO(del_total); } @@ -472,7 +474,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long mask; struct blk_plug plug; - mask = swapin_nr_pages(offset) - 1; + mask = is_swap_fast(entry) ? 0 : swapin_nr_pages(offset) - 1; if (!mask) goto skip; diff --git a/mm/swapfile.c b/mm/swapfile.c index 65e07eb6558b..fca04a53cf78 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -75,8 +75,8 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock.
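calculate_write_pending() above turns sysctl_swap_ratio into a pair of per-device write budgets: the SWP_FAST device always gets SWAP_FAST_WRITES credits, and its slow sibling gets (SWAP_FAST_WRITES * 100) / ratio - SWAP_FAST_WRITES, so the long-run share of writes landing on the fast device equals the ratio. A standalone check of that arithmetic, with the constants copied from swap_ratio.c:

#include <stdio.h>

#define SWAPFILE_CLUSTER 256
#define SWAP_CLUSTER_MAX 32
#define SWAP_FAST_WRITES (SWAPFILE_CLUSTER * (SWAP_CLUSTER_MAX / 8)) /* 1024 */
#define SWAP_SLOW_WRITES SWAPFILE_CLUSTER                            /* 256 */

int main(void)
{
	for (int ratio = 100; ratio >= 50; ratio -= 10) {
		unsigned int fast = ratio ? SWAP_FAST_WRITES : 0;
		unsigned int slow = ratio ?
			(SWAP_FAST_WRITES * 100) / ratio - SWAP_FAST_WRITES :
			SWAP_SLOW_WRITES;
		printf("ratio %3d%% -> fast %4u writes, slow %4u writes "
		       "(%.1f%% to fast)\n", ratio, fast, slow,
		       100.0 * fast / (fast + slow));
	}
	return 0;
}

With the default ratio of 100 the slow budget is zero, i.e. get_swap_page() keeps draining the fast device; for ratio 60 the budgets come out 1024:682, a 60/40 split.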
*/ -static PLIST_HEAD(swap_avail_head); -static DEFINE_SPINLOCK(swap_avail_lock); +PLIST_HEAD(swap_avail_head); +DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -91,6 +91,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } +bool is_swap_fast(swp_entry_t entry) +{ + struct swap_info_struct *p; + unsigned long type; + + if (non_swap_entry(entry)) + return false; + + type = swp_type(entry); + if (type >= nr_swapfiles) + return false; + + p = swap_info[type]; + + if (p->flags & SWP_FAST) + return true; + + return false; +} + /* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) @@ -193,7 +213,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -564,7 +583,7 @@ checks: scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -604,7 +623,8 @@ scan: spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && + si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -619,7 +639,8 @@ scan: spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && + si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -640,18 +661,39 @@ swp_entry_t get_swap_page(void) { struct swap_info_struct *si, *next; pgoff_t offset; + int swap_ratio_off = 0; if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); +lock_and_start: spin_lock(&swap_avail_lock); start_over: plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + + if (sysctl_swap_ratio && !swap_ratio_off) { + int ret; + + spin_unlock(&swap_avail_lock); + ret = swap_ratio(&si); + if (0 > ret) { + /* + * Error. Start again with swap + * ratio disabled. + */ + swap_ratio_off = 1; + goto lock_and_start; + } else { + goto start; + } + } + /* requeue si to after same-priority siblings */ plist_requeue(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); +start: spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); @@ -1008,7 +1050,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || + vm_swap_full(page_swap_info(page)))) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -2535,11 +2578,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } + if (p->bdev && blk_queue_fast(bdev_get_queue(p->bdev))) + p->flags |= SWP_FAST; + mutex_lock(&swapon_mutex); prio = -1; - if (swap_flags & SWAP_FLAG_PREFER) + if (swap_flags & SWAP_FLAG_PREFER) { prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + setup_swap_ratio(p, prio); + } enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); pr_info("Adding %uk swap on %s. 
Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", diff --git a/mm/truncate.c b/mm/truncate.c index f4c8270f7b84..8ca20a98c327 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -63,6 +63,171 @@ unlock: spin_unlock_irq(&mapping->tree_lock); } +static void do_truncate_inode_pages_range(struct address_space *mapping, + loff_t lstart, loff_t lend, bool fill_zero) +{ + pgoff_t start; /* inclusive */ + pgoff_t end; /* exclusive */ + unsigned int partial_start; /* inclusive */ + unsigned int partial_end; /* exclusive */ + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t index; + int i; + + cleancache_invalidate_inode(mapping); + if (mapping->nrpages == 0 && mapping->nrshadows == 0) + return; + + /* Offsets within partial pages */ + partial_start = lstart & (PAGE_CACHE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + + /* + * 'start' and 'end' always covers the range of pages to be fully + * truncated. Partial pages are covered with 'partial_start' at the + * start of the range and 'partial_end' at the end of the range. + * Note that 'end' is exclusive while 'lend' is inclusive. + */ + start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (lend == -1) + /* + * lend == -1 indicates end-of-file so we have to set 'end' + * to the highest possible pgoff_t and since the type is + * unsigned we're using -1. + */ + end = -1; + else + end = (lend + 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + index = start; + + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index >= end) + break; + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_inode_page(mapping, page); + if (fill_zero) + zero_user(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + cond_resched(); + index++; + } + + if (partial_start) { + struct page *page = find_lock_page(mapping, start - 1); + + if (page) { + unsigned int top = PAGE_CACHE_SIZE; + + if (start > end) { + /* Truncation within a single page */ + top = partial_end; + partial_end = 0; + } + wait_on_page_writeback(page); + zero_user_segment(page, partial_start, top); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, partial_start, + top - partial_start); + unlock_page(page); + page_cache_release(page); + } + } + if (partial_end) { + struct page *page = find_lock_page(mapping, end); + + if (page) { + wait_on_page_writeback(page); + zero_user_segment(page, 0, partial_end); + cleancache_invalidate_page(mapping, page); + if (page_has_private(page)) + do_invalidatepage(page, 0, + partial_end); + unlock_page(page); + page_cache_release(page); + } + } + /* + * If the truncation happened within a single page no pages + * will be released, just zeroed, so we can bail out now. 
+ */ + if (start >= end) + return; + + index = start; + for ( ; ; ) { + cond_resched(); + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* If all gone from start onwards, we're done */ + if (index == start) + break; + /* Otherwise restart to make sure all gone */ + index = start; + continue; + } + if (index == start && indices[0] >= end) { + /* All gone out of hole to be punched, we're done */ + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + break; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + /* We rely upon deletion not changing page->index */ + index = indices[i]; + if (index >= end) { + /* Restart punch to make sure all gone */ + index = start - 1; + break; + } + + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + + lock_page(page); + WARN_ON(page->index != index); + wait_on_page_writeback(page); + truncate_inode_page(mapping, page); + if (fill_zero) + zero_user(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + index++; + } + cleancache_invalidate_inode(mapping); +} + /** * do_invalidatepage - invalidate part or all of a page * @page: the page which is affected @@ -218,162 +383,43 @@ int invalidate_inode_page(struct page *page) void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - pgoff_t start; /* inclusive */ - pgoff_t end; /* exclusive */ - unsigned int partial_start; /* inclusive */ - unsigned int partial_end; /* exclusive */ - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - pgoff_t index; - int i; - - cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) - return; - - /* Offsets within partial pages */ - partial_start = lstart & (PAGE_CACHE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); - - /* - * 'start' and 'end' always covers the range of pages to be fully - * truncated. Partial pages are covered with 'partial_start' at the - * start of the range and 'partial_end' at the end of the range. - * Note that 'end' is exclusive while 'lend' is inclusive. - */ - start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (lend == -1) - /* - * lend == -1 indicates end-of-file so we have to set 'end' - * to the highest possible pgoff_t and since the type is - * unsigned we're using -1. 
- */ - end = -1; - else - end = (lend + 1) >> PAGE_CACHE_SHIFT; - - pagevec_init(&pvec, 0); - index = start; - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - - /* We rely upon deletion not changing page->index */ - index = indices[i]; - if (index >= end) - break; - - if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); - continue; - } - - if (!trylock_page(page)) - continue; - WARN_ON(page->index != index); - if (PageWriteback(page)) { - unlock_page(page); - continue; - } - truncate_inode_page(mapping, page); - unlock_page(page); - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - cond_resched(); - index++; - } - - if (partial_start) { - struct page *page = find_lock_page(mapping, start - 1); - if (page) { - unsigned int top = PAGE_CACHE_SIZE; - if (start > end) { - /* Truncation within a single page */ - top = partial_end; - partial_end = 0; - } - wait_on_page_writeback(page); - zero_user_segment(page, partial_start, top); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, partial_start, - top - partial_start); - unlock_page(page); - page_cache_release(page); - } - } - if (partial_end) { - struct page *page = find_lock_page(mapping, end); - if (page) { - wait_on_page_writeback(page); - zero_user_segment(page, 0, partial_end); - cleancache_invalidate_page(mapping, page); - if (page_has_private(page)) - do_invalidatepage(page, 0, - partial_end); - unlock_page(page); - page_cache_release(page); - } - } - /* - * If the truncation happened within a single page no pages - * will be released, just zeroed, so we can bail out now. - */ - if (start >= end) - return; - - index = start; - for ( ; ; ) { - cond_resched(); - if (!pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { - /* If all gone from start onwards, we're done */ - if (index == start) - break; - /* Otherwise restart to make sure all gone */ - index = start; - continue; - } - if (index == start && indices[0] >= end) { - /* All gone out of hole to be punched, we're done */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - break; - } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - - /* We rely upon deletion not changing page->index */ - index = indices[i]; - if (index >= end) { - /* Restart punch to make sure all gone */ - index = start - 1; - break; - } - - if (radix_tree_exceptional_entry(page)) { - clear_exceptional_entry(mapping, index, page); - continue; - } - - lock_page(page); - WARN_ON(page->index != index); - wait_on_page_writeback(page); - truncate_inode_page(mapping, page); - unlock_page(page); - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - index++; - } - cleancache_invalidate_inode(mapping); + do_truncate_inode_pages_range(mapping, lstart, lend, false); } EXPORT_SYMBOL(truncate_inode_pages_range); /** + * truncate_inode_pages_range_fill_zero - truncate range of pages specified by start & + * end byte offsets and zero them out + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * @lend: offset to which to truncate (inclusive) + * + * Truncate the page cache, removing the pages that are between + * specified offsets (and zeroing out partial pages + * if lstart or lend + 1 is not page aligned). 
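The offset arithmetic at the top of do_truncate_inode_pages_range() is the subtle part of both truncate variants: start/end bound the fully truncated pages (end exclusive, lend inclusive) while partial_start/partial_end pick out the sub-page tails to zero. A small standalone program that reproduces those four values for 4 KiB pages:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)

static void bounds(long long lstart, long long lend)
{
	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
	unsigned long long start =
		(lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long long end = (lend == -1) ?
		(unsigned long long)-1 :	/* EOF: highest pgoff_t */
		(lend + 1) >> PAGE_CACHE_SHIFT;

	printf("lstart=%lld lend=%lld -> pages [%llu,%llu) "
	       "partial_start=%u partial_end=%u\n",
	       lstart, lend, start, end, partial_start, partial_end);
}

int main(void)
{
	bounds(6144, -1);	/* truncate to EOF from mid-page 1 */
	bounds(0, 10239);	/* punch [0, 10240): 2.5 pages */
	bounds(5000, 7000);	/* hole inside a single page: start > end */
	return 0;
}

The third case is the "Truncation within a single page" branch above, where start ends up greater than end and partial_end becomes the top bound of a single zeroed segment.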
+ * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + * + * Note that since ->invalidatepage() accepts range to invalidate + * truncate_inode_pages_range is able to handle cases where lend + 1 is not + * page aligned properly. + */ +void truncate_inode_pages_range_fill_zero(struct address_space *mapping, + loff_t lstart, loff_t lend) +{ + do_truncate_inode_pages_range(mapping, lstart, lend, true); +} +EXPORT_SYMBOL(truncate_inode_pages_range_fill_zero); + +/** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate @@ -392,6 +438,27 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) EXPORT_SYMBOL(truncate_inode_pages); /** + * truncate_inode_pages_fill_zero - truncate *all* the pages from an offset + * and zero them out + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. + */ +void truncate_inode_pages_fill_zero(struct address_space *mapping, + loff_t lstart) +{ + truncate_inode_pages_range_fill_zero(mapping, lstart, (loff_t)-1); +} +EXPORT_SYMBOL(truncate_inode_pages_fill_zero); + + +/** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * diff --git a/mm/util.c b/mm/util.c index 2ab16a248776..9fa1aaab23d6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -368,9 +368,10 @@ struct address_space *page_mapping(struct page *page) } mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & PAGE_MAPPING_ANON) return NULL; - return page->mapping; + + return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); } EXPORT_SYMBOL(page_mapping); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 77f2f0f501d1..32e83c8bd087 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -291,6 +290,57 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +#ifdef CONFIG_ENABLE_VMALLOC_SAVING +#define POSSIBLE_VMALLOC_START PAGE_OFFSET + +#define VMALLOC_BITMAP_SIZE ((VMALLOC_END - PAGE_OFFSET) >> \ + PAGE_SHIFT) +#define VMALLOC_TO_BIT(addr) ((addr - PAGE_OFFSET) >> PAGE_SHIFT) +#define BIT_TO_VMALLOC(i) (PAGE_OFFSET + i * PAGE_SIZE) + +unsigned long total_vmalloc_size; +unsigned long vmalloc_reserved; + +DECLARE_BITMAP(possible_areas, VMALLOC_BITMAP_SIZE); + 
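Under the (vendor-specific) CONFIG_ENABLE_VMALLOC_SAVING layout, is_vmalloc_addr() can no longer be a simple range test: an address qualifies only if it lies in [PAGE_OFFSET, VMALLOC_END) and its page is not marked in possible_areas. A compilable userspace model of the bitmap scheme implemented by the two functions that follow; the addresses assume an illustrative 32-bit split and are not taken from the patch:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_OFFSET 0xc0000000UL	/* illustrative 32-bit layout */
#define VMALLOC_END 0xff000000UL
#define NPAGES ((VMALLOC_END - PAGE_OFFSET) >> PAGE_SHIFT)

static unsigned char reserved[NPAGES / 8];	/* models possible_areas */

static void mark_reserved(unsigned long addr, unsigned long size)
{
	for (unsigned long a = addr; a < addr + size; a += 1UL << PAGE_SHIFT) {
		unsigned long bit = (a - PAGE_OFFSET) >> PAGE_SHIFT;
		reserved[bit / 8] |= 1 << (bit % 8);
	}
}

static int is_vmalloc(unsigned long addr)
{
	if (addr < PAGE_OFFSET || addr >= VMALLOC_END)
		return 0;
	unsigned long bit = (addr - PAGE_OFFSET) >> PAGE_SHIFT;
	return !(reserved[bit / 8] & (1 << (bit % 8)));
}

int main(void)
{
	mark_reserved(0xc0000000UL, 0x100000);	/* say, the kernel image */
	printf("%d %d %d\n",
	       is_vmalloc(0xc0001000UL),	/* 0: reserved linear map */
	       is_vmalloc(0xc0200000UL),	/* 1: reusable hole */
	       is_vmalloc(0xbf000000UL));	/* 0: below the range */
	return 0;
}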
+void mark_vmalloc_reserved_area(void *x, unsigned long size) +{ + unsigned long addr = (unsigned long)x; + + bitmap_set(possible_areas, VMALLOC_TO_BIT(addr), size >> PAGE_SHIFT); + vmalloc_reserved += size; +} + +int is_vmalloc_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + + if (addr < POSSIBLE_VMALLOC_START || addr >= VMALLOC_END) + return 0; + + if (test_bit(VMALLOC_TO_BIT(addr), possible_areas)) + return 0; + + return 1; +} + +static void calc_total_vmalloc_size(void) +{ + total_vmalloc_size = VMALLOC_END - POSSIBLE_VMALLOC_START - + vmalloc_reserved; +} +#else +int is_vmalloc_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + + return addr >= VMALLOC_START && addr < VMALLOC_END; +} + +static void calc_total_vmalloc_size(void) { } +#endif +EXPORT_SYMBOL(is_vmalloc_addr); + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -363,6 +413,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + might_sleep(); + va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) @@ -577,6 +629,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual critical section protected + * by this lock, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -591,65 +650,40 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; + bool do_free = false; - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway.
- */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); + lockdep_assert_held(&vmap_purge_lock); - if (sync) - purge_fragmented_blocks_allcpus(); - - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; + do_free = true; } - rcu_read_unlock(); - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!do_free) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); } - spin_unlock(&purge_lock); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -658,9 +692,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (mutex_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + } } /* @@ -668,9 +703,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); } /* @@ -680,20 +716,16 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) - try_purge_vmap_area_lazy(); -} + int nr_lazy; -/* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. 
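The rewritten purge path is a lock-free producer / locked consumer split: free_vmap_area_noflush() publishes areas with llist_add(), and the purger detaches the whole batch via llist_del_all() under vmap_purge_lock, issues one TLB flush spanning the batch, and only then frees each area. A single-threaded toy model of that push/drain shape (the kernel's llist provides the same API with cmpxchg underneath):

#include <stdio.h>
#include <stddef.h>

struct area {
	unsigned long start, end;
	struct area *next;		/* models llist_node */
};

static struct area *purge_list;		/* models vmap_purge_list */

static void lazy_free(struct area *a)	/* models llist_add() */
{
	a->next = purge_list;
	purge_list = a;
}

static void purge_all(void)		/* models __purge_vmap_area_lazy() */
{
	struct area *batch = purge_list;	/* llist_del_all() */
	purge_list = NULL;

	unsigned long start = ~0UL, end = 0;
	for (struct area *a = batch; a; a = a->next) {
		if (a->start < start) start = a->start;
		if (a->end > end) end = a->end;
	}
	if (end)	/* one TLB flush covers every batched area */
		printf("flush_tlb_kernel_range(%#lx, %#lx)\n", start, end);
	/* ...then each area is returned to the allocator */
}

int main(void)
{
	struct area a = { 0xf0000000UL, 0xf0004000UL, NULL };
	struct area b = { 0xf0100000UL, 0xf0101000UL, NULL };
	lazy_free(&a);
	lazy_free(&b);
	purge_all();
	return 0;
}

Batching the flush is the whole point: one kernel-range TLB invalidation amortizes over every lazily freed area instead of one per vunmap.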
- */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) + try_purge_vmap_area_lazy(); } /* @@ -702,7 +734,8 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) @@ -716,16 +749,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1046,6 +1069,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1070,7 +1095,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); @@ -1083,7 +1112,9 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1092,10 +1123,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1144,6 +1179,33 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro EXPORT_SYMBOL(vm_map_ram); static struct vm_struct *vmlist __initdata; + +/** + * vm_area_check_early - check if vmap area is already mapped + * @vm: vm_struct to be checked + * + * This function is used to check if the vmap area has been + * mapped already. @vm->addr, @vm->size and @vm->flags should + * contain proper values. 
+ * + */ +int __init vm_area_check_early(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + + BUG_ON(vmap_initialized); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) { + if (tmp->addr < vm->addr + vm->size) + return 1; + } else { + if (tmp->addr + tmp->size > vm->addr) + return 1; + } + } + return 0; +} + /** * vm_area_add_early - add vmap area early during boot * @vm: vm_struct to add @@ -1224,7 +1286,7 @@ void __init vmalloc_init(void) } vmap_area_pcpu_hole = VMALLOC_END; - + calc_total_vmalloc_size(); vmap_initialized = true; } @@ -1388,16 +1450,27 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { +#ifdef CONFIG_ENABLE_VMALLOC_SAVING + return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); +#else return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); +#endif } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { +#ifdef CONFIG_ENABLE_VMALLOC_SAVING + return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, caller); +#else return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); +#endif } /** @@ -1431,6 +1504,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; @@ -1489,7 +1564,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs.
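Because remove_vm_area() can now sleep, the practical rule for clients becomes: vfree() only from sleepable context or hard interrupt (where it self-defers), vfree_atomic() everywhere else, e.g. under a spinlock in process context, where in_interrupt() is false but sleeping is forbidden. A hedged usage sketch; the lock and the cached buffer are invented:

#include <linux/vmalloc.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(cache_lock);	/* hypothetical */
static void *cache_buf;			/* hypothetical vmalloc'd buffer */

static void drop_cache_locked(void)
{
	void *old;

	spin_lock(&cache_lock);
	old = cache_buf;
	cache_buf = NULL;
	/*
	 * vfree() may sleep now that purging takes a mutex, so under
	 * a spinlock the deferred variant is the only safe choice.
	 * vfree_atomic(NULL) is a no-op, like vfree(NULL).
	 */
	vfree_atomic(old);
	spin_unlock(&cache_lock);
}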
+ */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1512,11 +1619,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); @@ -2654,6 +2759,9 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_VPAGES) seq_puts(m, " vpages"); + if (v->flags & VM_LOWMEM) + seq_puts(m, " lowmem"); + show_numa_info(m, v); seq_putc(m, '\n'); return 0; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 3fb15c25af87..f5383e43597a 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -22,6 +22,9 @@ #include <linux/slab.h> #include <linux/swap.h> #include <linux/printk.h> +#include <linux/notifier.h> +#include <linux/init.h> +#include <linux/module.h> #include <linux/vmpressure.h> /* @@ -38,7 +41,7 @@ * TODO: Make the window size depend on machine size, as we do for vmstat * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). */ -static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; +static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; /* * These thresholds are used when we account memory pressure through @@ -49,6 +52,33 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; static const unsigned int vmpressure_level_med = 60; static const unsigned int vmpressure_level_critical = 95; +static unsigned long vmpressure_scale_max = 100; +module_param_named(vmpressure_scale_max, vmpressure_scale_max, + ulong, S_IRUGO | S_IWUSR); + +/* vmpressure values >= this will be scaled based on allocstalls */ +static unsigned long allocstall_threshold = 70; +module_param_named(allocstall_threshold, allocstall_threshold, + ulong, S_IRUGO | S_IWUSR); + +static struct vmpressure global_vmpressure; +BLOCKING_NOTIFIER_HEAD(vmpressure_notifier); + +int vmpressure_notifier_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&vmpressure_notifier, nb); +} + +int vmpressure_notifier_unregister(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&vmpressure_notifier, nb); +} + +void vmpressure_notify(unsigned long pressure) +{ + blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL); +} + /* * When there are too few pages left to scan, vmpressure() may miss the * critical pressure as number of pages will be less than "window size".
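Two knobs shape the number handed to the new notifier chain, both implemented in the hunks below: vmpressure_calc_pressure() keeps the upstream scanned/reclaimed ratio on a 0-100 scale, and vmpressure_account_stall() inflates it by the direct-reclaim ("stall") share of scanning once it crosses allocstall_threshold. A standalone sketch of the scaling step; the ratio formula here is a simplification of the upstream one, only the stall step is copied verbatim:

#include <stdio.h>

static unsigned long scale_max = 100;		/* vmpressure_scale_max */
static unsigned long stall_threshold = 70;	/* allocstall_threshold */

static unsigned long calc_pressure(unsigned long scanned,
				   unsigned long reclaimed)
{
	if (reclaimed >= scanned)
		return 0;
	return (scanned - reclaimed) * 100 / scanned;
}

static unsigned long account_stall(unsigned long pressure,
				   unsigned long stall, unsigned long scanned)
{
	if (pressure < stall_threshold)
		return pressure;
	return pressure + (scale_max - pressure) * stall / scanned;
}

int main(void)
{
	unsigned long scanned = 512, reclaimed = 64, stall = 256;
	unsigned long p = calc_pressure(scanned, reclaimed);

	/* 87 raw; half the scanning came from direct reclaim -> 93 */
	printf("raw %lu, stalled %lu\n", p,
	       account_stall(p, stall, scanned));
	return 0;
}

The effect is that kswapd-only scanning reports the plain ratio, while heavy direct reclaim pushes an already-high reading further toward scale_max.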
@@ -75,6 +105,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) return container_of(work, struct vmpressure, work); } +#ifdef CONFIG_MEMCG static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); @@ -85,6 +116,12 @@ static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) return NULL; return memcg_to_vmpressure(memcg); } +#else +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + return NULL; +} +#endif enum vmpressure_levels { VMPRESSURE_LOW = 0, @@ -108,7 +145,7 @@ static enum vmpressure_levels vmpressure_level(unsigned long pressure) return VMPRESSURE_LOW; } -static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, +static unsigned long vmpressure_calc_pressure(unsigned long scanned, unsigned long reclaimed) { unsigned long scale = scanned + reclaimed; @@ -135,7 +172,20 @@ out: pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, scanned, reclaimed); - return vmpressure_level(pressure); + return pressure; +} + +static unsigned long vmpressure_account_stall(unsigned long pressure, + unsigned long stall, unsigned long scanned) +{ + unsigned long scale; + + if (pressure < allocstall_threshold) + return pressure; + + scale = ((vmpressure_scale_max - pressure) * stall) / scanned; + + return pressure + scale; } struct vmpressure_event { @@ -149,9 +199,11 @@ static bool vmpressure_event(struct vmpressure *vmpr, { struct vmpressure_event *ev; enum vmpressure_levels level; + unsigned long pressure; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); + pressure = vmpressure_calc_pressure(scanned, reclaimed); + level = vmpressure_level(pressure); mutex_lock(&vmpr->events_lock); @@ -203,24 +255,13 @@ static void vmpressure_work_fn(struct work_struct *work) } while ((vmpr = vmpressure_parent(vmpr))); } -/** - * vmpressure() - Account memory pressure through scanned/reclaimed ratio - * @gfp: reclaimer's gfp mask - * @memcg: cgroup memory controller handle - * @scanned: number of pages scanned - * @reclaimed: number of pages reclaimed - * - * This function should be called from the vmscan reclaim path to account - * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw - * pressure index is then further refined and averaged over time. - * - * This function does not return any value. - */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + BUG_ON(!vmpr); + /* * Here we only want to account pressure that userland is able to * help us with. For example, suppose that DMA zone is under @@ -257,6 +298,94 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, schedule_work(&vmpr->work); } +void calculate_vmpressure_win(void) +{ + long x; + + x = global_page_state(NR_FILE_PAGES) - + global_page_state(NR_SHMEM) - + total_swapcache_pages() + + global_page_state(NR_FREE_PAGES); + if (x < 1) + x = 1; + /* + * For low (free + cached), vmpressure window should be + * small, and high for higher values of (free + cached). + * But it should not be linear either. This ensures + * timely vmpressure notifications when system is under + * memory pressure, and optimal number of events when + * cached is high. The square root function is empirically + * found to serve the purpose.
+ */ + x = int_sqrt(x); + vmpressure_win = x; +} + +void vmpressure_global(gfp_t gfp, unsigned long scanned, + unsigned long reclaimed) +{ + struct vmpressure *vmpr = &global_vmpressure; + unsigned long pressure; + unsigned long stall; + + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + if (!scanned) + return; + + spin_lock(&vmpr->sr_lock); + if (!vmpr->scanned) + calculate_vmpressure_win(); + + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + + if (!current_is_kswapd()) + vmpr->stall += scanned; + + stall = vmpr->stall; + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + spin_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win) + return; + + spin_lock(&vmpr->sr_lock); + vmpr->scanned = 0; + vmpr->reclaimed = 0; + vmpr->stall = 0; + spin_unlock(&vmpr->sr_lock); + + pressure = vmpressure_calc_pressure(scanned, reclaimed); + pressure = vmpressure_account_stall(pressure, stall, scanned); + vmpressure_notify(pressure); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + if (!memcg) + vmpressure_global(gfp, scanned, reclaimed); + + if (IS_ENABLED(CONFIG_MEMCG)) + vmpressure_memcg(gfp, memcg, scanned, reclaimed); +} + /** * vmpressure_prio() - Account memory pressure through reclaimer priority level * @gfp: reclaimer's gfp mask @@ -308,6 +437,8 @@ int vmpressure_register_event(struct mem_cgroup *memcg, struct vmpressure_event *ev; int level; + BUG_ON(!vmpr); + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { if (!strcmp(vmpressure_str_levels[level], args)) break; @@ -347,6 +478,8 @@ void vmpressure_unregister_event(struct mem_cgroup *memcg, struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; + BUG_ON(!vmpr); + mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { if (ev->efd != eventfd) @@ -388,3 +521,10 @@ void vmpressure_cleanup(struct vmpressure *vmpr) */ flush_work(&vmpr->work); } + +int vmpressure_global_init(void) +{ + vmpressure_init(&global_vmpressure); + return 0; +} +late_initcall(vmpressure_global_init); diff --git a/mm/vmscan.c b/mm/vmscan.c index 930f7c67a9c1..67da9446135d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -104,6 +104,13 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + /* + * Reclaim pages from a vma. If the page is shared by other tasks + * it is zapped from the vma without being reclaimed, so it remains + * in memory until the last task zaps it.
+ */ + struct vm_area_struct *target_vma; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -146,6 +153,12 @@ int vm_swappiness = 60; */ unsigned long vm_total_pages; +#ifdef CONFIG_KSWAPD_CPU_AFFINITY_MASK +char *kswapd_cpu_mask = CONFIG_KSWAPD_CPU_AFFINITY_MASK; +#else +char *kswapd_cpu_mask = NULL; +#endif + static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -281,6 +294,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; + long min_cache_size = batch_size; + + if (current_is_kswapd()) + min_cache_size = 0; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -348,7 +365,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * scanning at high prio and therefore should try to reclaim as much as * possible. */ - while (total_scan >= batch_size || + while (total_scan > min_cache_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -385,6 +402,35 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, return freed; } +static void shrink_slab_lmk(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) +{ + struct shrinker *shrinker; + + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; + + if (!down_read_trylock(&shrinker_rwsem)) + goto out; + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + }; + + if (!(shrinker->flags & SHRINKER_LMK)) + continue; + + do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); +} + /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context @@ -446,6 +492,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; + if (shrinker->flags & SHRINKER_LMK) + continue; + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; @@ -915,7 +964,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct address_space *mapping; struct page *page; int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; + enum page_references references = PAGEREF_RECLAIM; bool dirty, writeback; cond_resched(); @@ -927,7 +976,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); + if (zone) + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -1006,7 +1056,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { + (zone && test_bit(ZONE_WRITEBACK, &zone->flags))) { nr_immediate++; goto keep_locked; @@ -1072,7 +1122,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_mapped(page) && mapping) { switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { + ttu_flags|TTU_BATCH_FLUSH, + sc->target_vma)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1092,7 +1143,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { + (zone && + !test_bit(ZONE_DIRTY, &zone->flags)))) { /* * Immediately reclaim when written back. 
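shrink_slab_lmk() only walks shrinkers flagged SHRINKER_LMK, and the regular shrink_slab() now skips them, so a low-memory-killer style client is driven from the reclaim hooks added below rather than alongside the slab caches. A hedged sketch of how such a client presumably registers; the callbacks are placeholders and SHRINKER_LMK is the flag introduced by this series:

#include <linux/shrinker.h>

static unsigned long lmk_count_objects(struct shrinker *s,
				       struct shrink_control *sc)
{
	return 1;	/* placeholder: always claim reclaimable work */
}

static unsigned long lmk_scan_objects(struct shrinker *s,
				      struct shrink_control *sc)
{
	/* a real client would pick and kill a victim task here */
	return SHRINK_STOP;	/* nothing reported as freed slab */
}

static struct shrinker lmk_shrinker = {
	.count_objects	= lmk_count_objects,
	.scan_objects	= lmk_scan_objects,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_LMK,	/* only shrink_slab_lmk() walks us */
};

/* register_shrinker(&lmk_shrinker); */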
* Similar in principle to deactivate_page() @@ -1204,6 +1256,13 @@ free_it: * appear not as the counts should be low */ list_add(&page->lru, &free_pages); + /* + * If the page list spans multiple zones, we must decrement + * NR_ISOLATED_ANON + x for the freed pages here. + */ + if (!zone) + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); continue; cull_mlocked: @@ -1215,7 +1274,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && vm_swap_full(page_swap_info(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); @@ -1249,6 +1308,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, .may_unmap = 1, + /* Don't allow dirty pages to be written out */ + .may_writepage = 0, }; unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; @@ -1256,7 +1317,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, list_for_each_entry_safe(page, next, page_list, lru) { if (page_is_file_cache(page) && !PageDirty(page) && - !isolated_balloon_page(page)) { + !__PageMovable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -1270,6 +1331,42 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, return ret; } +#ifdef CONFIG_PROCESS_RECLAIM +unsigned long reclaim_pages_from_list(struct list_head *page_list, + struct vm_area_struct *vma) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .target_vma = vma, + }; + + unsigned long nr_reclaimed; + struct page *page; + unsigned long dummy1, dummy2, dummy3, dummy4, dummy5; + + list_for_each_entry(page, page_list, lru) + ClearPageActive(page); + + nr_reclaimed = shrink_page_list(page_list, NULL, &sc, + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + + while (!list_empty(page_list)) { + page = lru_to_page(page_list); + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + + return nr_reclaimed; +} +#endif + /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -1456,6 +1553,44 @@ int isolate_lru_page(struct page *page) return ret; } +static int __too_many_isolated(struct zone *zone, int file, + struct scan_control *sc, int safe) +{ + unsigned long inactive, isolated; + + if (file) { + if (safe) { + inactive = zone_page_state_snapshot(zone, + NR_INACTIVE_FILE); + isolated = zone_page_state_snapshot(zone, + NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } + } else { + if (safe) { + inactive = zone_page_state_snapshot(zone, + NR_INACTIVE_ANON); + isolated = zone_page_state_snapshot(zone, + NR_ISOLATED_ANON); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + } + + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock.
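too_many_isolated() now works in two steps: the cheap, possibly stale per-cpu-folded counters are consulted first, and only if those claim trouble does the retry pay for zone_page_state_snapshot(), which sums the outstanding per-cpu deltas for an exact view. The same cheap-then-precise pattern in miniature:

#include <stdio.h>

static long counter = 90;	/* folded (approximate) value */
static long pcpu_delta = -30;	/* not yet folded into the counter */

static long state(void)		 { return counter; }
static long state_snapshot(void) { return counter + pcpu_delta; }

static int too_many(long limit, int safe)
{
	if (state() <= limit)		/* fast path: approximate read */
		return 0;
	if (safe)			/* slow path: exact, retry only */
		return state_snapshot() > limit;
	return 1;
}

int main(void)
{
	/* The first pass trusts the stale counter and backs off... */
	printf("first try: %d\n", too_many(80, 0));	/* 1 */
	/* ...the retry pays for the snapshot and may proceed. */
	printf("retry:     %d\n", too_many(80, 1));	/* 0 */
	return 0;
}

This keeps the common path free of the expensive per-cpu summation while preventing a reclaimer from stalling forever on counters that merely look over the limit.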
+ */ + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + inactive >>= 3; + + return isolated > inactive; +} + /* * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and * then get rescheduled. When there is a massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ static int too_many_isolated(struct zone *zone, int file, - struct scan_control *sc) + struct scan_control *sc, int safe) { - unsigned long inactive, isolated; - if (current_is_kswapd()) return 0; if (!sane_reclaim(sc)) return 0; - if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); - } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + if (unlikely(__too_many_isolated(zone, file, sc, 0))) { + if (safe) + return __too_many_isolated(zone, file, sc, safe); + else + return 1; } - /* - * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they - * won't get blocked by normal direct-reclaimers, forming a circular - * deadlock. - */ - if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) - inactive >>= 3; - - return isolated > inactive; + return 0; } static noinline_for_stack void @@ -1506,6 +1630,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) while (!list_empty(page_list)) { struct page *page = lru_to_page(page_list); int lru; + int file; VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); @@ -1522,8 +1647,11 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) lru = page_lru(page); add_page_to_lru_list(page, lruvec, lru); + file = is_file_lru(lru); + if (IS_ENABLED(CONFIG_ZCACHE)) + if (file) + SetPageWasActive(page); if (is_active_lru(lru)) { - int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; } @@ -1580,15 +1708,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); + int safe = 0; struct zone *zone = lruvec_zone(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - while (unlikely(too_many_isolated(zone, file, sc))) { + while (unlikely(too_many_isolated(zone, file, sc, safe))) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) return SWAP_CLUSTER_MAX; + + safe = 1; } lru_add_drain(); @@ -1845,6 +1976,12 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + if (IS_ENABLED(CONFIG_ZCACHE)) + /* + * For zcache to know whether the page is from the active + * file list + */ + SetPageWasActive(page); list_add(&page->lru, &l_inactive); } @@ -2060,7 +2197,8 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * There is enough inactive page cache, do not reclaim * anything from the anonymous working set right now. */ - if (!inactive_file_is_low(lruvec)) { + if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) && + !inactive_file_is_low(lruvec)) { scan_balance = SCAN_FILE; goto out; } @@ -2433,15 +2571,23 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, sc->nr_scanned - nr_scanned, zone_lru_pages); + /* + * Record the subtree's reclaim efficiency.
The pages reclaimed + * from slab are excluded here because the corresponding + * scanned pages are not accounted. Moreover, freeing a page + * by slab shrinking depends on each slab's object population, + * making the cost model (i.e. scan:free) different from that + * of LRU. + */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -2515,6 +2661,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); bool reclaimable = false; + unsigned long lru_pages = 0; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2542,6 +2689,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * to global LRU. */ if (global_reclaim(sc)) { + lru_pages += zone_reclaimable_pages(zone); if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; @@ -2592,6 +2740,9 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) reclaimable = true; } + if (global_reclaim(sc)) + shrink_slab_lmk(sc->gfp_mask, 0, NULL, + sc->nr_scanned, lru_pages); /* * Restore to original mask to avoid the impact on the caller if we * promoted it to __GFP_HIGHMEM. @@ -2950,18 +3101,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3011,7 +3167,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3066,9 +3222,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, struct scan_control *sc, - unsigned long *nr_attempted) + unsigned long lru_pages) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; @@ -3076,17 +3231,6 @@ static bool kswapd_shrink_zone(struct zone *zone, sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); /* - * Kswapd reclaims only single pages with compaction enabled.
Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the * high wmark plus a "gap" where the gap is either the low @@ -3100,14 +3244,13 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; + shrink_slab_lmk(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned, lru_pages); clear_bit(ZONE_WRITEBACK, &zone->flags); @@ -3118,7 +3261,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. */ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3130,7 +3273,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3147,8 +3290,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3165,9 +3307,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); + unsigned long lru_pages = 0; sc.nr_reclaimed = 0; @@ -3202,7 +3343,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3218,32 +3359,23 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; + /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. 
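With the kcompactd handoff, zone_balanced() no longer asks compaction whether a high-order page could be assembled; on the pgdat_balanced()/kswapd path it instead tests an order-0 watermark padded by 2^order pages, and only callers passing highorder get the true high-order check. The adjusted-mark arithmetic stand-alone, with invented watermark values:

#include <stdio.h>

static unsigned long adjusted_mark(unsigned long high_wmark,
				   unsigned long balance_gap,
				   int order, int highorder, int *chk_order)
{
	unsigned long mark = high_wmark + balance_gap;

	if (!highorder) {		/* kswapd/pgdat_balanced() path */
		mark += 1UL << order;	/* pad by the allocation size... */
		*chk_order = 0;		/* ...but test at order 0 */
	} else {
		*chk_order = order;
	}
	return mark;
}

int main(void)
{
	int chk;
	/* order-3 wakeup, 1024-page high watermark, no gap */
	unsigned long m = adjusted_mark(1024, 0, 3, 0, &chk);

	printf("check order %d against %lu pages\n", chk, m); /* 0, 1032 */
	return 0;
}

The design point is that kswapd stops once enough order-0 pages exist for compaction to have raw material, and wakeup_kcompactd() (invoked in the sleep path below) takes over the actual defragmentation.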
- */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; + lru_pages += zone_reclaimable_pages(zone); } /* - * If we're getting trouble reclaiming, start doing writepage - * even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - - /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -3279,8 +3411,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc, lru_pages)) raise_priority = false; } @@ -3293,49 +3424,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3346,7 +3457,22 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. 
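Together with the zone_balanced() rework earlier in this diff, the handoff works like this: kswapd reclaims only until the order-0 free count covers the high watermark plus one block of the requested order, then wakes kcompactd to assemble the block. A self-contained sketch of that stopping condition, with illustrative numbers:

#include <stdbool.h>
#include <stdio.h>

/* kswapd's stop test after the rework: an order-0 check against
 * high watermark + 2^order, not a true high-order watermark test. */
static bool kswapd_done(unsigned long free_pages, unsigned long high_wmark,
			unsigned int order)
{
	return free_pages >= high_wmark + (1UL << order);
}

int main(void)
{
	/* order-3 request, high watermark of 1000 pages: kswapd can sleep
	 * once 1008 free pages exist, even before any order-3 block does. */
	printf("%d\n", kswapd_done(1010, 1000, 3));	/* -> 1 */
	return 0;
}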
+ */ + wakeup_kcompactd(pgdat, order, classzone_idx); + remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3356,7 +3482,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3369,14 +3496,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - /* - * Compaction records what page blocks it recently failed to - * isolate pages from and skips them in the future scanning. - * When kswapd is going to sleep, it is reasonable to assume - * that pages and compaction may succeed so reset the cache. - */ - reset_isolation_suitable(pgdat); - if (!kthread_should_stop()) schedule(); @@ -3406,7 +3525,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; - unsigned balanced_order; int classzone_idx, new_classzone_idx; int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; @@ -3419,7 +3537,7 @@ static int kswapd(void *p) lockdep_set_current_reclaim_state(GFP_KERNEL); - if (!cpumask_empty(cpumask)) + if (kswapd_cpu_mask == NULL && !cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -3439,24 +3557,19 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; - balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { bool ret; /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at + * While we were reclaiming, there might have been another + * wakeup, so check the values. 
*/ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; if (order < new_order || classzone_idx > new_classzone_idx) { /* @@ -3466,7 +3579,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, balanced_order, + kswapd_try_to_sleep(pgdat, order, classzone_idx, balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; @@ -3486,9 +3599,8 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); + balanced_classzone_idx = balance_pgdat(pgdat, order, + classzone_idx); } } @@ -3518,7 +3630,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, 0, 0)) + if (zone_balanced(zone, order, true, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); @@ -3589,6 +3701,22 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action, return NOTIFY_OK; } +static int set_kswapd_cpu_mask(pg_data_t *pgdat) +{ + int ret = 0; + cpumask_t tmask; + + if (!kswapd_cpu_mask) + return 0; + + cpumask_clear(&tmask); + ret = cpumask_parse(kswapd_cpu_mask, &tmask); + if (ret) + return ret; + + return set_cpus_allowed_ptr(pgdat->kswapd, &tmask); +} + /* * This kswapd start function will be called by init and node-hot-add. * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
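set_kswapd_cpu_mask() above delegates the string format to cpumask_parse(). As a userspace illustration of the simple single-chunk case (values hypothetical; the kernel parser additionally accepts comma-separated 32-bit chunks for larger masks):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* e.g. CONFIG_KSWAPD_CPU_AFFINITY_MASK="f" from the Kconfig help */
	const char *kswapd_cpu_mask = "f";
	unsigned long mask = strtoul(kswapd_cpu_mask, NULL, 16);
	int cpu;

	for (cpu = 0; cpu < 32; cpu++)
		if (mask & (1UL << cpu))
			printf("kswapd may run on CPU %d\n", cpu);	/* 0-3 */
	return 0;
}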
@@ -3608,6 +3736,9 @@ int kswapd_run(int nid) pr_err("Failed to start kswapd on node %d\n", nid); ret = PTR_ERR(pgdat->kswapd); pgdat->kswapd = NULL; + } else if (kswapd_cpu_mask) { + if (set_kswapd_cpu_mask(pgdat)) + pr_warn("error setting kswapd cpu affinity mask\n"); } return ret; } @@ -3633,7 +3764,8 @@ static int __init kswapd_init(void) swap_setup(); for_each_node_state(nid, N_MEMORY) kswapd_run(nid); - hotcpu_notifier(cpu_callback, 0); + if (kswapd_cpu_mask == NULL) + hotcpu_notifier(cpu_callback, 0); return 0; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8640a185dfc6..6c841595b963 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -764,6 +764,7 @@ const char * const vmstat_text[] = { "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", + "nr_swapcache", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -773,6 +774,7 @@ const char * const vmstat_text[] = { /* enum vm_event_item counters */ "pgpgin", "pgpgout", + "pgpgoutclean", "pswpin", "pswpout", @@ -826,6 +828,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE @@ -904,6 +907,7 @@ static void frag_stop(struct seq_file *m, void *arg) /* Walk all the zones in a node and print using a callback */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool nolock, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -914,27 +918,16 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, if (!populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); + if (!nolock) + spin_lock_irqsave(&zone->lock, flags); print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); + if (!nolock) + spin_unlock_irqrestore(&zone->lock, flags); } } #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Movable", - "Reclaimable", - "HighAtomic", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -952,7 +945,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, false, frag_show_print); return 0; } @@ -993,7 +986,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, false, pagetypeinfo_showfree_print); return 0; } @@ -1042,7 +1035,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, false, pagetypeinfo_showblockcount_print); return 0; } @@ -1086,7 +1079,11 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, page = pfn_to_page(pfn); if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long freepage_order; + + freepage_order = page_order_unsafe(page); + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; continue; } @@ -1133,7 +1130,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef 
CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); @@ -1143,7 +1140,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1276,7 +1273,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1393,7 +1390,7 @@ static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats(true)) { + if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the @@ -1405,22 +1402,13 @@ static void vmstat_update(struct work_struct *w) } else { /* * We did not update any counters so the app may be in - * a mode where it does not cause counter updates. + * a mode where it does not cause counter updates or the cpu + * was isolated. * We may be uselessly running vmstat_update. * Defer the checking for differentials to the * shepherd thread on a different processor. */ - int r; - /* - * Shepherd work thread does not race since it never - * changes the bit if its zero but the cpu - * online / off line code may race if - * worker threads are still allowed during - * shutdown / startup. - */ - r = cpumask_test_and_set_cpu(smp_processor_id(), - cpu_stat_off); - VM_BUG_ON(r); + cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } @@ -1482,7 +1470,7 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && + if (!cpu_isolated(cpu) && need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) queue_delayed_work_on(cpu, vmstat_wq, @@ -1646,7 +1634,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, false, unusable_show_print); return 0; } @@ -1698,7 +1686,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, false, extfrag_show_print); return 0; } diff --git a/mm/zbud.c b/mm/zbud.c index d8a181fd779b..09ab957e2b10 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -357,13 +357,15 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, struct zbud_header *zhdr = NULL; enum buddy bud; struct page *page; + unsigned long flags; + int found = 0; if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; chunks = size_to_chunks(size); - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); /* First, try to find an unbuddied zbud page. 
*/ zhdr = NULL; @@ -376,16 +378,17 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, bud = FIRST; else bud = LAST; + found = 1; goto found; } } /* Couldn't find unbuddied zbud page, create new one */ - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); page = alloc_page(gfp); if (!page) return -ENOMEM; - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); pool->pages_nr++; zhdr = init_zbud_page(page); bud = FIRST; @@ -411,7 +414,9 @@ found: list_add(&zhdr->lru, &pool->lru); *handle = encode_handle(zhdr, bud); - spin_unlock(&pool->lock); + if ((gfp & __GFP_ZERO) && found) + memset((void *)*handle, 0, size); + spin_unlock_irqrestore(&pool->lock, flags); return 0; } @@ -430,8 +435,9 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) { struct zbud_header *zhdr; int freechunks; + unsigned long flags; - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); zhdr = handle_to_zbud_header(handle); /* If first buddy, handle will be page aligned */ @@ -442,7 +448,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) if (zhdr->under_reclaim) { /* zbud page is under reclaim, reclaim will free */ - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); return; } @@ -460,7 +466,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); } - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } #define list_tail_entry(ptr, type, member) \ @@ -505,12 +511,13 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) { int i, ret, freechunks; struct zbud_header *zhdr; + unsigned long flags; unsigned long first_handle = 0, last_handle = 0; - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || retries == 0) { - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); return -EINVAL; } for (i = 0; i < retries; i++) { @@ -529,7 +536,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) first_handle = encode_handle(zhdr, FIRST); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); /* Issue the eviction callback(s) */ if (first_handle) { @@ -543,7 +550,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) goto next; } next: - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); zhdr->under_reclaim = false; if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { /* @@ -552,7 +559,7 @@ next: */ free_zbud_page(zhdr); pool->pages_nr--; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); return 0; } else if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { @@ -567,7 +574,7 @@ next: /* add to beginning of LRU */ list_add(&zhdr->lru, &pool->lru); } - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); return -EAGAIN; } diff --git a/mm/zcache.c b/mm/zcache.c new file mode 100644 index 000000000000..01473566ed0b --- /dev/null +++ b/mm/zcache.c @@ -0,0 +1,1169 @@ +/* + * linux/mm/zcache.c + * + * A cleancache backend for file pages compression. + * Concepts based on original zcache by Dan Magenheimer. + * Copyright (C) 2013 Bob Liu <bob.liu@xxxxxxxxxx> + * + * With zcache, active file pages can be compressed in memory during page + * reclaiming. When their data is needed again the I/O reading operation is + * avoided. 
This results in a significant performance gain under memory pressure + * for systems with many file pages. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/cleancache.h> +#include <linux/cpu.h> +#include <linux/crypto.h> +#include <linux/page-flags.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/mm_types.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/radix-tree.h> +#include <linux/rbtree.h> +#include <linux/types.h> +#include <linux/zbud.h> + +/* + * Enable/disable zcache (disabled by default) + */ +static bool zcache_enabled __read_mostly; +module_param_named(enabled, zcache_enabled, bool, 0); + +/* + * Compressor to be used by zcache + */ +#define ZCACHE_COMPRESSOR_DEFAULT "lzo" +static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; +module_param_named(compressor, zcache_compressor, charp, 0); + +/* + * The maximum percentage of memory that the compressed pool can occupy. + */ +static unsigned int zcache_max_pool_percent = 10; +module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644); + +static unsigned int zcache_clear_percent = 4; +module_param_named(clear_percent, zcache_clear_percent, uint, 0644); +/* + * zcache statistics + */ +static u64 zcache_pool_limit_hit; +static u64 zcache_dup_entry; +static u64 zcache_zbud_alloc_fail; +static u64 zcache_evict_zpages; +static u64 zcache_evict_filepages; +static u64 zcache_inactive_pages_refused; +static u64 zcache_reclaim_fail; +static u64 zcache_pool_shrink; +static u64 zcache_pool_shrink_fail; +static u64 zcache_pool_shrink_pages; +static u64 zcache_store_failed; +static atomic_t zcache_stored_pages = ATOMIC_INIT(0); +static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0); + +#define GFP_ZCACHE \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC | __GFP_ZERO) + +/* + * Make sure this is different from radix tree + * indirect ptr or exceptional entry. + */ +#define ZERO_HANDLE ((void *)~(~0UL >> 1)) + +/* + * Zcache receives pages for compression through the Cleancache API and is able + * to evict pages from its own compressed pool on an LRU basis in the case that + * the compressed pool is full. + * + * Zcache makes use of zbud for the managing the compressed memory pool. Each + * allocation in zbud is not directly accessible by address. Rather, a handle + * (zaddr) is return by the allocation routine and that handle(zaddr must be + * mapped before being accessed. The compressed memory pool grows on demand and + * shrinks as compressed pages are freed. + * + * When a file page is passed from cleancache to zcache, zcache maintains a + * mapping of the <filesystem_type, inode_number, page_index> to the zbud + * address that references that compressed file page. This mapping is achieved + * with a red-black tree per filesystem type, plus a radix tree per red-black + * node. 
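To make the two-level index concrete, here is a hedged sketch of a read-only lookup built from the helpers defined further down in this file. The patch itself only ever combines lookup with deletion (zcache_load_delete_zaddr()); a non-destructive variant like this is purely illustrative:

/* Sketch only: resolve a <inode, page->index> pair to a zbud handle. */
static void *zcache_peek_zaddr(struct zcache_pool *zpool,
		int rb_index, int ra_index)
{
	struct zcache_rbnode *rbnode;
	unsigned long flags;
	void *zaddr;

	/* level 1: per-pool red-black tree keyed by inode number */
	rbnode = zcache_find_get_rbnode(zpool, rb_index);
	if (!rbnode)
		return NULL;

	/* level 2: per-inode radix tree keyed by page->index */
	spin_lock_irqsave(&rbnode->ra_lock, flags);
	zaddr = radix_tree_lookup(&rbnode->ratree, ra_index);
	spin_unlock_irqrestore(&rbnode->ra_lock, flags);

	kref_put(&rbnode->refcount, zcache_rbnode_release);
	return zaddr;
}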
+ * + * A zcache pool with pool_id as the index is created when a filesystem mounted + * Each zcache pool has a red-black tree, the inode number(rb_index) is the + * search key. Each red-black tree node has a radix tree which use + * page->index(ra_index) as the index. Each radix tree slot points to the zbud + * address combining with some extra information(zcache_ra_handle). + */ +#define MAX_ZCACHE_POOLS 32 +/* + * One zcache_pool per (cleancache aware) filesystem mount instance + */ +struct zcache_pool { + struct rb_root rbtree; + rwlock_t rb_lock; /* Protects rbtree */ + u64 size; + struct zbud_pool *pool; /* Zbud pool used */ +}; + +/* + * Manage all zcache pools + */ +struct _zcache { + struct zcache_pool *pools[MAX_ZCACHE_POOLS]; + u32 num_pools; /* Current no. of zcache pools */ + spinlock_t pool_lock; /* Protects pools[] and num_pools */ +}; +struct _zcache zcache; + +/* + * Redblack tree node, each node has a page index radix-tree. + * Indexed by inode nubmer. + */ +struct zcache_rbnode { + struct rb_node rb_node; + int rb_index; + struct radix_tree_root ratree; /* Page radix tree per inode rbtree */ + spinlock_t ra_lock; /* Protects radix tree */ + struct kref refcount; +}; + +/* + * Radix-tree leaf, indexed by page->index + */ +struct zcache_ra_handle { + int rb_index; /* Redblack tree index */ + int ra_index; /* Radix tree index */ + int zlen; /* Compressed page size */ + struct zcache_pool *zpool; /* Finding zcache_pool during evict */ +}; + +u64 zcache_pages(void) +{ + int i; + u64 count = 0; + + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + count += zcache.pools[i]->size; + + return count; +} + +static struct kmem_cache *zcache_rbnode_cache; +static int zcache_rbnode_cache_create(void) +{ + zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0); + return zcache_rbnode_cache == NULL; +} +static void zcache_rbnode_cache_destroy(void) +{ + kmem_cache_destroy(zcache_rbnode_cache); +} + +static unsigned long zcache_count(struct shrinker *s, + struct shrink_control *sc) +{ + unsigned long active_file; + long file_gap; + + active_file = global_page_state(NR_ACTIVE_FILE); + file_gap = zcache_pages() - active_file; + if (file_gap < 0) + file_gap = 0; + return file_gap; +} + +static unsigned long zcache_scan(struct shrinker *s, struct shrink_control *sc) +{ + unsigned long active_file; + unsigned long file; + long file_gap; + unsigned long freed = 0; + unsigned long pool; + static bool running; + int i = 0; + int retries; + + if (running) + goto end; + + running = true; + active_file = global_page_state(NR_ACTIVE_FILE); + file = global_page_state(NR_FILE_PAGES); + pool = zcache_pages(); + + file_gap = pool - file; + + if ((file_gap >= 0) && + (totalram_pages * zcache_clear_percent / 100 > file)) { + file_gap = pool; + zcache_pool_shrink++; + goto reclaim; + } + + /* + * file_gap == 0 means that the number of pages + * stored by zcache is around twice as many as the + * number of active file pages. 
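A small worked example of the zcache_count() arithmetic above, which reports to the shrinker only the pages zcache holds in excess of the active file set (numbers hypothetical):

#include <stdio.h>

int main(void)
{
	long zcache_pool = 3000;	/* pages held by zcache */
	long active_file = 2600;	/* NR_ACTIVE_FILE */
	long file_gap = zcache_pool - active_file;

	if (file_gap < 0)		/* pool smaller than active set */
		file_gap = 0;
	printf("shrinkable: %ld pages\n", file_gap);	/* -> 400 */
	return 0;
}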
+ */ + file_gap = pool - active_file; + if (file_gap < 0) + file_gap = 0; + else + zcache_pool_shrink++; + +reclaim: + retries = file_gap; + while ((file_gap > 0) && retries) { + struct zcache_pool *zpool = + zcache.pools[i++ % MAX_ZCACHE_POOLS]; + if (!zpool || !zpool->size) + continue; + if (zbud_reclaim_page(zpool->pool, 8)) { + zcache_pool_shrink_fail++; + retries--; + continue; + } + freed++; + file_gap--; + } + + zcache_pool_shrink_pages += freed; + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + zcache.pools[i]->size = + zbud_get_pool_size(zcache.pools[i]->pool); + + running = false; +end: + return freed; +} + +static struct shrinker zcache_shrinker = { + .scan_objects = zcache_scan, + .count_objects = zcache_count, + .seeks = DEFAULT_SEEKS * 16 +}; + +/* + * Compression functions + * (Below functions are copyed from zswap!) + */ +static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; + +enum comp_op { + ZCACHE_COMPOP_COMPRESS, + ZCACHE_COMPOP_DECOMPRESS +}; + +static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZCACHE_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZCACHE_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zcache_comp_init(void) +{ + if (!crypto_has_comp(zcache_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zcache_compressor); + /* fall back to default compressor */ + zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zcache_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zcache_compressor); + + /* alloc percpu transforms */ + zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zcache_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void zcache_comp_exit(void) +{ + /* free percpu transforms */ + if (zcache_comp_pcpu_tfms) + free_percpu(zcache_comp_pcpu_tfms); +} + +/* + * Per-cpu code + * (Below functions are also copyed from zswap!) 
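A hedged sketch of how zcache_comp_op() is used for a compress/decompress round trip, mirroring the store/load paths later in this file and assuming the per-CPU zcache_dstmem scratch buffer set up just below (2 * PAGE_SIZE, since incompressible data can expand):

/* Sketch only: compress one page into the per-CPU scratch buffer and
 * decompress it back in place, verifying the recovered length. */
static int zcache_roundtrip(struct page *page)
{
	u8 *dst = get_cpu_var(zcache_dstmem);
	unsigned int zlen = PAGE_SIZE, dlen = PAGE_SIZE;
	u8 *src;
	int ret;

	src = kmap_atomic(page);
	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE,
			dst, &zlen);
	if (!ret)
		ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, dst, zlen,
				src, &dlen);
	kunmap_atomic(src);
	put_cpu_var(zcache_dstmem);

	if (!ret && dlen != PAGE_SIZE)
		ret = -EINVAL;
	return ret;
}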
+ */ +static DEFINE_PER_CPU(u8 *, zcache_dstmem); + +static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zcache_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; + dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zcache_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zcache_dstmem, cpu); + kfree(dst); + per_cpu(zcache_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcache_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + + return __zcache_cpu_notifier(action, cpu); +} + +static struct notifier_block zcache_cpu_notifier_block = { + .notifier_call = zcache_cpu_notifier +}; + +static int zcache_cpu_init(void) +{ + unsigned long cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + register_cpu_notifier(&zcache_cpu_notifier_block); + put_online_cpus(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcache_cpu_notifier(CPU_UP_CANCELED, cpu); + put_online_cpus(); + return -ENOMEM; +} + +/* + * Zcache helpers + */ +static bool zcache_is_full(void) +{ + long file = global_page_state(NR_FILE_PAGES); + + return ((totalram_pages * zcache_max_pool_percent / 100 < + zcache_pages()) || + (totalram_pages * zcache_clear_percent / 100 > + file)); +} + +/* + * The caller must hold zpool->rb_lock at least + */ +static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree, + int index, struct rb_node **rb_parent, struct rb_node ***rb_link) +{ + struct zcache_rbnode *entry; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; + + __rb_link = &rbtree->rb_node; + rb_prev = __rb_parent = NULL; + + while (*__rb_link) { + __rb_parent = *__rb_link; + entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node); + if (entry->rb_index > index) + __rb_link = &__rb_parent->rb_left; + else if (entry->rb_index < index) { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } else + return entry; + } + + if (rb_parent) + *rb_parent = __rb_parent; + if (rb_link) + *rb_link = __rb_link; + return NULL; +} + +static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool, + int rb_index) +{ + unsigned long flags; + struct zcache_rbnode *rbnode; + + read_lock_irqsave(&zpool->rb_lock, flags); + rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, 0, 0); + if (rbnode) + kref_get(&rbnode->refcount); + read_unlock_irqrestore(&zpool->rb_lock, flags); + return rbnode; +} + +/* + * kref_put callback for zcache_rbnode. + * + * The rbnode must have been isolated from rbtree already. + */ +static void zcache_rbnode_release(struct kref *kref) +{ + struct zcache_rbnode *rbnode; + + rbnode = container_of(kref, struct zcache_rbnode, refcount); + BUG_ON(rbnode->ratree.rnode); + kmem_cache_free(zcache_rbnode_cache, rbnode); +} + +/* + * Check whether the radix-tree of this rbnode is empty. 
+ * If it is, we can delete this zcache_rbnode from
+ * zcache_pool->rbtree.
+ *
+ * Caller must hold zcache_rbnode->ra_lock.
+ */
+static int zcache_rbnode_empty(struct zcache_rbnode *rbnode)
+{
+	return rbnode->ratree.rnode == NULL;
+}
+
+/*
+ * Remove a zcache_rbnode from zpool->rbtree
+ *
+ * rb_lock_held - whether the caller already holds zpool->rb_lock
+ */
+static void zcache_rbnode_isolate(struct zcache_pool *zpool,
+		struct zcache_rbnode *rbnode, bool rb_lock_held)
+{
+	unsigned long flags;
+
+	if (!rb_lock_held)
+		write_lock_irqsave(&zpool->rb_lock, flags);
+	/*
+	 * Someone can get a reference on this rbnode before we could
+	 * acquire the write lock above.
+	 * We want to remove it from zpool->rbtree only when the caller and
+	 * the corresponding ratree hold the only references to this rbnode.
+	 * The check below ensures that a racing zcache put will not end up
+	 * adding a page to an isolated node and thereby losing that memory.
+	 */
+	if (atomic_read(&rbnode->refcount.refcount) == 2) {
+		rb_erase(&rbnode->rb_node, &zpool->rbtree);
+		RB_CLEAR_NODE(&rbnode->rb_node);
+		kref_put(&rbnode->refcount, zcache_rbnode_release);
+	}
+	if (!rb_lock_held)
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+}
+
+/*
+ * Store a zaddr allocated by zbud_alloc() into the rbtree-ratree hierarchy.
+ */
+static int zcache_store_zaddr(struct zcache_pool *zpool,
+		int ra_index, int rb_index, unsigned long zaddr)
+{
+	unsigned long flags;
+	struct zcache_rbnode *rbnode, *tmp;
+	struct rb_node **link = NULL, *parent = NULL;
+	int ret;
+	void *dup_zaddr;
+
+	rbnode = zcache_find_get_rbnode(zpool, rb_index);
+	if (!rbnode) {
+		/* allocate and initialize a new rbnode */
+		rbnode = kmem_cache_alloc(zcache_rbnode_cache,
+				GFP_ZCACHE);
+		if (!rbnode)
+			return -ENOMEM;
+
+		INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN);
+		spin_lock_init(&rbnode->ra_lock);
+		rbnode->rb_index = rb_index;
+		kref_init(&rbnode->refcount);
+		RB_CLEAR_NODE(&rbnode->rb_node);
+
+		/* add the rbnode to the rbtree */
+		write_lock_irqsave(&zpool->rb_lock, flags);
+		tmp = zcache_find_rbnode(&zpool->rbtree, rb_index,
+				&parent, &link);
+		if (tmp) {
+			/* somebody else allocated a new rbnode */
+			kmem_cache_free(zcache_rbnode_cache, rbnode);
+			rbnode = tmp;
+		} else {
+			rb_link_node(&rbnode->rb_node, parent, link);
+			rb_insert_color(&rbnode->rb_node, &zpool->rbtree);
+		}
+
+		/* Take a reference on this zcache_rbnode */
+		kref_get(&rbnode->refcount);
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+	}
+
+	/* We have successfully obtained a zcache_rbnode at this point */
+	spin_lock_irqsave(&rbnode->ra_lock, flags);
+	dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+	if (unlikely(dup_zaddr)) {
+		WARN(1, "duplicated entry, will be replaced!\n");
+		if (dup_zaddr == ZERO_HANDLE) {
+			atomic_dec(&zcache_stored_zero_pages);
+		} else {
+			zbud_free(zpool->pool, (unsigned long)dup_zaddr);
+			atomic_dec(&zcache_stored_pages);
+			zpool->size = zbud_get_pool_size(zpool->pool);
+		}
+		zcache_dup_entry++;
+	}
+
+	/* Insert the zcache_ra_handle into the ratree */
+	ret = radix_tree_insert(&rbnode->ratree, ra_index,
+			(void *)zaddr);
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+	if (unlikely(ret)) {
+		write_lock_irqsave(&zpool->rb_lock, flags);
+		spin_lock(&rbnode->ra_lock);
+
+		if (zcache_rbnode_empty(rbnode))
+			zcache_rbnode_isolate(zpool, rbnode, 1);
+
+		spin_unlock(&rbnode->ra_lock);
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+	}
+
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+	return ret;
+}
+
+/*
+ * Load a zaddr and delete it from the radix tree.
+ * If the radix tree of the corresponding rbnode becomes empty, delete the
+ * rbnode from zpool->rbtree as well.
+ */
+static void *zcache_load_delete_zaddr(struct zcache_pool *zpool,
+		int rb_index, int ra_index)
+{
+	struct zcache_rbnode *rbnode;
+	void *zaddr = NULL;
+	unsigned long flags;
+
+	rbnode = zcache_find_get_rbnode(zpool, rb_index);
+	if (!rbnode)
+		goto out;
+
+	BUG_ON(rbnode->rb_index != rb_index);
+
+	spin_lock_irqsave(&rbnode->ra_lock, flags);
+	zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+
+	/* rb_lock and ra_lock must be taken again in the given sequence */
+	write_lock_irqsave(&zpool->rb_lock, flags);
+	spin_lock(&rbnode->ra_lock);
+	if (zcache_rbnode_empty(rbnode))
+		zcache_rbnode_isolate(zpool, rbnode, 1);
+	spin_unlock(&rbnode->ra_lock);
+	write_unlock_irqrestore(&zpool->rb_lock, flags);
+
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+out:
+	return zaddr;
+}
+
+static bool zero_page(struct page *page)
+{
+	unsigned long *ptr = kmap_atomic(page);
+	int i;
+	bool ret = false;
+
+	for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) {
+		if (ptr[i])
+			goto out;
+	}
+	ret = true;
+out:
+	kunmap_atomic(ptr);
+	return ret;
+}
+
+static void zcache_store_page(int pool_id, struct cleancache_filekey key,
+		pgoff_t index, struct page *page)
+{
+	struct zcache_ra_handle *zhandle;
+	u8 *zpage, *src, *dst;
+	/* Address of zhandle + compressed data (zpage) */
+	unsigned long zaddr = 0;
+	unsigned int zlen = PAGE_SIZE;
+	bool zero = false;
+	int ret;
+
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+
+	/*
+	 * Zcache would be ineffective if the compressed memory pool filled
+	 * up with compressed inactive file pages, most of which will never
+	 * be used again.
+	 * So we refuse to compress pages that do not come from the active
+	 * file list.
+	 */
+	if (!PageWasActive(page)) {
+		zcache_inactive_pages_refused++;
+		return;
+	}
+
+	zero = zero_page(page);
+	if (zero)
+		goto zero;
+
+	if (zcache_is_full()) {
+		zcache_pool_limit_hit++;
+		if (zbud_reclaim_page(zpool->pool, 8)) {
+			zcache_reclaim_fail++;
+			return;
+		}
+		/*
+		 * Continue if we successfully reclaimed a page frame.
+ */ + zcache_evict_filepages++; + zpool->size = zbud_get_pool_size(zpool->pool); + } + + /* compress */ + dst = get_cpu_var(zcache_dstmem); + src = kmap_atomic(page); + ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst, + &zlen); + kunmap_atomic(src); + if (ret) { + pr_err("zcache compress error ret %d\n", ret); + put_cpu_var(zcache_dstmem); + return; + } + + /* store zcache handle together with compressed page data */ + ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle), + GFP_ZCACHE, &zaddr); + if (ret) { + zcache_zbud_alloc_fail++; + put_cpu_var(zcache_dstmem); + return; + } + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr); + + /* Compressed page data stored at the end of zcache_ra_handle */ + zpage = (u8 *)(zhandle + 1); + memcpy(zpage, dst, zlen); + zbud_unmap(zpool->pool, zaddr); + put_cpu_var(zcache_dstmem); + +zero: + if (zero) + zaddr = (unsigned long)ZERO_HANDLE; + + /* store zcache handle */ + ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr); + if (ret) { + zcache_store_failed++; + if (!zero) + zbud_free(zpool->pool, zaddr); + return; + } + + /* update stats */ + if (zero) { + atomic_inc(&zcache_stored_zero_pages); + } else { + zhandle->ra_index = index; + zhandle->rb_index = key.u.ino; + zhandle->zlen = zlen; + zhandle->zpool = zpool; + atomic_inc(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + + return; +} + +static int zcache_load_page(int pool_id, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + int ret = 0; + u8 *src, *dst; + void *zaddr; + unsigned int dlen = PAGE_SIZE; + struct zcache_ra_handle *zhandle; + struct zcache_pool *zpool = zcache.pools[pool_id]; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (!zaddr) + return -ENOENT; + else if (zaddr == ZERO_HANDLE) + goto map; + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, + (unsigned long)zaddr); + /* Compressed page data stored at the end of zcache_ra_handle */ + src = (u8 *)(zhandle + 1); + + /* decompress */ +map: + dst = kmap_atomic(page); + if (zaddr != ZERO_HANDLE) { + ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src, + zhandle->zlen, dst, &dlen); + } else { + memset(dst, 0, PAGE_SIZE); + kunmap_atomic(dst); + flush_dcache_page(page); + atomic_dec(&zcache_stored_zero_pages); + goto out; + } + kunmap_atomic(dst); + zbud_unmap(zpool->pool, (unsigned long)zaddr); + zbud_free(zpool->pool, (unsigned long)zaddr); + + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* update stats */ + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); +out: + SetPageWasActive(page); + return ret; +} + +static void zcache_flush_page(int pool_id, struct cleancache_filekey key, + pgoff_t index) +{ + struct zcache_pool *zpool = zcache.pools[pool_id]; + void *zaddr = NULL; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (zaddr && (zaddr != ZERO_HANDLE)) { + zbud_free(zpool->pool, (unsigned long)zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } else if (zaddr == ZERO_HANDLE) { + atomic_dec(&zcache_stored_zero_pages); + } +} + +#define FREE_BATCH 16 +/* + * Callers must hold the lock + */ +static void zcache_flush_ratree(struct zcache_pool *zpool, + struct zcache_rbnode *rbnode) +{ + unsigned long index = 0; + int count, i; + struct zcache_ra_handle *zhandle; + void *zaddr = NULL; + + do { + void *zaddrs[FREE_BATCH]; + unsigned long indices[FREE_BATCH]; + + count = 
radix_tree_gang_lookup_index(&rbnode->ratree, + (void **)zaddrs, indices, + index, FREE_BATCH); + + for (i = 0; i < count; i++) { + if (zaddrs[i] == ZERO_HANDLE) { + zaddr = radix_tree_delete(&rbnode->ratree, + indices[i]); + if (zaddr) + atomic_dec(&zcache_stored_zero_pages); + continue; + } + zhandle = (struct zcache_ra_handle *)zbud_map( + zpool->pool, (unsigned long)zaddrs[i]); + index = zhandle->ra_index; + zaddr = radix_tree_delete(&rbnode->ratree, index); + if (!zaddr) + continue; + zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]); + zbud_free(zpool->pool, (unsigned long)zaddrs[i]); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + + index++; + } while (count == FREE_BATCH); +} + +static void zcache_flush_inode(int pool_id, struct cleancache_filekey key) +{ + struct zcache_rbnode *rbnode; + unsigned long flags1, flags2; + struct zcache_pool *zpool = zcache.pools[pool_id]; + + /* + * Refuse new pages added in to the same rbinode, so get rb_lock at + * first. + */ + write_lock_irqsave(&zpool->rb_lock, flags1); + rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, 0, 0); + if (!rbnode) { + write_unlock_irqrestore(&zpool->rb_lock, flags1); + return; + } + + kref_get(&rbnode->refcount); + spin_lock_irqsave(&rbnode->ra_lock, flags2); + + zcache_flush_ratree(zpool, rbnode); + if (zcache_rbnode_empty(rbnode)) + /* When arrvied here, we already hold rb_lock */ + zcache_rbnode_isolate(zpool, rbnode, 1); + + spin_unlock_irqrestore(&rbnode->ra_lock, flags2); + write_unlock_irqrestore(&zpool->rb_lock, flags1); + kref_put(&rbnode->refcount, zcache_rbnode_release); +} + +static void zcache_destroy_pool(struct zcache_pool *zpool); +static void zcache_flush_fs(int pool_id) +{ + struct zcache_rbnode *z_rbnode = NULL; + struct rb_node *rbnode; + unsigned long flags1, flags2; + struct zcache_pool *zpool; + + if (pool_id < 0) + return; + + zpool = zcache.pools[pool_id]; + if (!zpool) + return; + + /* + * Refuse new pages added in, so get rb_lock at first. + */ + write_lock_irqsave(&zpool->rb_lock, flags1); + + rbnode = rb_first(&zpool->rbtree); + while (rbnode) { + z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node); + rbnode = rb_next(rbnode); + if (z_rbnode) { + kref_get(&z_rbnode->refcount); + spin_lock_irqsave(&z_rbnode->ra_lock, flags2); + zcache_flush_ratree(zpool, z_rbnode); + if (zcache_rbnode_empty(z_rbnode)) + zcache_rbnode_isolate(zpool, z_rbnode, 1); + spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2); + kref_put(&z_rbnode->refcount, zcache_rbnode_release); + } + } + + write_unlock_irqrestore(&zpool->rb_lock, flags1); + zcache_destroy_pool(zpool); +} + +/* + * Evict compressed pages from zcache pool on an LRU basis after the compressed + * pool is full. 
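The function below is zcache's implementation of zbud's ops->evict contract. Reduced to its bare minimum (hypothetical backend, not part of this patch), a conforming handler unlinks the handle from its own index, frees it, and returns 0 so zbud_reclaim_page() can retire the underlying page:

/* Hypothetical minimal evict callback illustrating the zbud contract.
 * zbud_free() on a page under reclaim only marks that buddy free; the
 * reclaim path itself frees the zbud page once both buddies are gone. */
static int toy_evict(struct zbud_pool *pool, unsigned long handle)
{
	/* a real backend first drops its own references to 'handle' */
	zbud_free(pool, handle);
	return 0;
}

static struct zbud_ops toy_ops = {
	.evict = toy_evict,
};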
+ */ +static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr) +{ + struct zcache_pool *zpool; + struct zcache_ra_handle *zhandle; + void *zaddr_intree; + + BUG_ON(zaddr == (unsigned long)ZERO_HANDLE); + + zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr); + + zpool = zhandle->zpool; + /* There can be a race with zcache store */ + if (!zpool) + return -EINVAL; + + BUG_ON(pool != zpool->pool); + + zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index, + zhandle->ra_index); + if (zaddr_intree) { + BUG_ON((unsigned long)zaddr_intree != zaddr); + zbud_unmap(pool, zaddr); + zbud_free(pool, zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(pool); + zcache_evict_zpages++; + } + return 0; +} + +static struct zbud_ops zcache_zbud_ops = { + .evict = zcache_evict_zpage +}; + +/* Return pool id */ +static int zcache_create_pool(void) +{ + int ret; + struct zcache_pool *zpool; + + zpool = kzalloc(sizeof(*zpool), GFP_KERNEL); + if (!zpool) { + ret = -ENOMEM; + goto out; + } + + zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops); + if (!zpool->pool) { + kfree(zpool); + ret = -ENOMEM; + goto out; + } + + spin_lock(&zcache.pool_lock); + if (zcache.num_pools == MAX_ZCACHE_POOLS) { + pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS); + zbud_destroy_pool(zpool->pool); + kfree(zpool); + ret = -EPERM; + goto out_unlock; + } + + rwlock_init(&zpool->rb_lock); + zpool->rbtree = RB_ROOT; + /* Add to pool list */ + for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++) + if (!zcache.pools[ret]) + break; + zcache.pools[ret] = zpool; + zcache.num_pools++; + pr_info("New pool created id:%d\n", ret); + +out_unlock: + spin_unlock(&zcache.pool_lock); +out: + return ret; +} + +static void zcache_destroy_pool(struct zcache_pool *zpool) +{ + int i; + + if (!zpool) + return; + + spin_lock(&zcache.pool_lock); + zcache.num_pools--; + for (i = 0; i < MAX_ZCACHE_POOLS; i++) + if (zcache.pools[i] == zpool) + break; + zcache.pools[i] = NULL; + spin_unlock(&zcache.pool_lock); + + if (!RB_EMPTY_ROOT(&zpool->rbtree)) + WARN_ON("Memory leak detected. 
Freeing non-empty pool!\n"); + + zbud_destroy_pool(zpool->pool); + kfree(zpool); +} + +static int zcache_init_fs(size_t pagesize) +{ + int ret; + + if (pagesize != PAGE_SIZE) { + pr_info("Unsupported page size: %zu", pagesize); + ret = -EINVAL; + goto out; + } + + ret = zcache_create_pool(); + if (ret < 0) { + pr_info("Failed to create new pool\n"); + ret = -ENOMEM; + goto out; + } +out: + return ret; +} + +static int zcache_init_shared_fs(char *uuid, size_t pagesize) +{ + /* shared pools are unsupported and map to private */ + return zcache_init_fs(pagesize); +} + +static struct cleancache_ops zcache_ops = { + .put_page = zcache_store_page, + .get_page = zcache_load_page, + .invalidate_page = zcache_flush_page, + .invalidate_inode = zcache_flush_inode, + .invalidate_fs = zcache_flush_fs, + .init_shared_fs = zcache_init_shared_fs, + .init_fs = zcache_init_fs +}; + +/* + * Debugfs functions + */ +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> + +static int pool_pages_get(void *_data, u64 *val) +{ + *val = zcache_pages(); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n"); + +static struct dentry *zcache_debugfs_root; + +static int __init zcache_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zcache_debugfs_root = debugfs_create_dir("zcache", NULL); + if (!zcache_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root, + &zcache_pool_limit_hit); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root, + &zcache_zbud_alloc_fail); + debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root, + &zcache_dup_entry); + debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL, + &pool_page_fops); + debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root, + &zcache_stored_pages); + debugfs_create_atomic_t("stored_zero_pages", S_IRUGO, + zcache_debugfs_root, &zcache_stored_zero_pages); + debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_zpages); + debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_filepages); + debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root, + &zcache_reclaim_fail); + debugfs_create_u64("inactive_pages_refused", S_IRUGO, + zcache_debugfs_root, &zcache_inactive_pages_refused); + debugfs_create_u64("pool_shrink_count", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink); + debugfs_create_u64("pool_shrink_fail", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_fail); + debugfs_create_u64("pool_shrink_pages", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_pages); + debugfs_create_u64("store_fail", S_IRUGO, + zcache_debugfs_root, &zcache_store_failed); + return 0; +} + +static void __exit zcache_debugfs_exit(void) +{ + debugfs_remove_recursive(zcache_debugfs_root); +} +#else +static int __init zcache_debugfs_init(void) +{ + return 0; +} +static void __exit zcache_debugfs_exit(void) +{ +} +#endif + +/* + * zcache init and exit + */ +static int __init init_zcache(void) +{ + if (!zcache_enabled) + return 0; + + pr_info("loading zcache..\n"); + if (zcache_rbnode_cache_create()) { + pr_err("entry cache creation failed\n"); + goto error; + } + + if (zcache_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zcache_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + + spin_lock_init(&zcache.pool_lock); + cleancache_register_ops(&zcache_ops); + + if (zcache_debugfs_init()) + 
pr_warn("debugfs initialization failed\n"); + register_shrinker(&zcache_shrinker); + return 0; +pcpufail: + zcache_comp_exit(); +compfail: + zcache_rbnode_cache_destroy(); +error: + return -ENOMEM; +} + +/* must be late so crypto has time to come up */ +late_initcall(init_zcache); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bob Liu <bob.liu@xxxxxxxxxx>"); +MODULE_DESCRIPTION("Compressed cache for clean file pages"); + diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c1ea19478119..1eb00e343523 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,32 +16,15 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->private: points to the first component (0-order) page - * page->index (union with page->freelist): offset of the first object - * starting in this page. For the first page, this is - * always 0, so we use this field (aka freelist) to point - * to the first free object in zspage. - * page->lru: links together all component pages (except the first page) - * of a zspage - * - * For _first_ page only: - * - * page->private: refers to the component page after the first page - * If the page is first_page for huge object, it stores handle. - * Look at size_class->huge. - * page->freelist: points to the first free object in zspage. - * Free objects are linked together using in-place - * metadata. - * page->objects: maximum number of objects we can store in this - * zspage (class->zspage_order * PAGE_SIZE / class->size) - * page->lru: links together first pages of various zspages. - * Basically forming list of zspages in a fullness group. - * page->mapping: class index and fullness group of the zspage - * page->inuse: the number of objects that are used in this zspage + * page->private: points to zspage + * page->freelist(index): links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. * * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page + * PG_owner_priv_1: indentifies the huge component page * */ @@ -64,6 +47,11 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> +#include <linux/mount.h> +#include <linux/migrate.h> +#include <linux/pagemap.h> + +#define ZSPAGE_MAGIC 0x58 /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -86,9 +74,7 @@ * Object location (<PFN>, <obj_idx>) is encoded as * as single (unsigned long) handle value. * - * Note that object index <obj_idx> is relative to system - * page <PFN> it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. + * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE. 
*/ @@ -147,33 +133,29 @@ * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above) */ -#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) /* * We do not maintain any list for completely empty or full pages */ enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - ZS_EMPTY, - ZS_FULL + ZS_ALMOST_EMPTY, + ZS_ALMOST_FULL, + ZS_FULL, + NR_ZS_FULLNESS, }; enum zs_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, + CLASS_FULL, OBJ_ALLOCATED, OBJ_USED, - CLASS_ALMOST_FULL, - CLASS_ALMOST_EMPTY, + NR_ZS_STAT_TYPE, }; -#ifdef CONFIG_ZSMALLOC_STAT -#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) -#else -#define NR_ZS_STAT_TYPE (OBJ_USED + 1) -#endif - struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -182,6 +164,10 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif +#ifdef CONFIG_COMPACTION +static struct vfsmount *zsmalloc_mnt; +#endif + /* * number of size_classes */ @@ -205,35 +191,49 @@ static const int fullness_threshold_frac = 4; struct size_class { spinlock_t lock; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. */ int size; - unsigned int index; - + int objs_per_zspage; /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - struct zs_size_stat stats; - /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ - bool huge; + unsigned int index; + struct zs_size_stat stats; }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetPageHugeObject(struct page *page) +{ + SetPageOwnerPriv1(page); +} + +static void ClearPageHugeObject(struct page *page) +{ + ClearPageOwnerPriv1(page); +} + +static int PageHugeObject(struct page *page) +{ + return PageOwnerPriv1(page); +} + /* * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. + * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { union { /* - * Position of next free chunk (encodes <PFN, obj_idx>) + * Free object index; * It's valid for non-allocated object */ - void *next; + unsigned long next; /* * Handle of allocated object. 
*/ @@ -246,8 +246,8 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; + struct kmem_cache *zspage_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -262,16 +262,36 @@ struct zs_pool { #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif +#ifdef CONFIG_COMPACTION + struct inode *inode; + struct work_struct free_work; +#endif }; /* * A zspage's class index and fullness group * are encoded in its (first)page->mapping */ -#define CLASS_IDX_BITS 28 -#define FULLNESS_BITS 4 -#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) -#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + +struct zspage { + struct { + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS; + unsigned int isolated:ISOLATED_BITS; + unsigned int magic:MAGIC_VAL_BITS; + }; + unsigned int inuse; + unsigned int freeobj; + struct page *first_page; + struct list_head list; /* fullness list */ +#ifdef CONFIG_COMPACTION + rwlock_t lock; +#endif +}; struct mapping_area { #ifdef CONFIG_PGTABLE_MAPPING @@ -281,32 +301,76 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ - bool huge; }; -static int create_handle_cache(struct zs_pool *pool) +#ifdef CONFIG_COMPACTION +static int zs_register_migration(struct zs_pool *pool); +static void zs_unregister_migration(struct zs_pool *pool); +static void migrate_lock_init(struct zspage *zspage); +static void migrate_read_lock(struct zspage *zspage); +static void migrate_read_unlock(struct zspage *zspage); +static void kick_deferred_free(struct zs_pool *pool); +static void init_deferred_free(struct zs_pool *pool); +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +#else +static int zsmalloc_mount(void) { return 0; } +static void zsmalloc_unmount(void) {} +static int zs_register_migration(struct zs_pool *pool) { return 0; } +static void zs_unregister_migration(struct zs_pool *pool) {} +static void migrate_lock_init(struct zspage *zspage) {} +static void migrate_read_lock(struct zspage *zspage) {} +static void migrate_read_unlock(struct zspage *zspage) {} +static void kick_deferred_free(struct zs_pool *pool) {} +static void init_deferred_free(struct zs_pool *pool) {} +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +#endif + +static int create_cache(struct zs_pool *pool) { pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0, NULL); - return pool->handle_cachep ? 
0 : 1; + if (!pool->handle_cachep) + return 1; + + pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), + 0, 0, NULL); + if (!pool->zspage_cachep) { + kmem_cache_destroy(pool->handle_cachep); + pool->handle_cachep = NULL; + return 1; + } + + return 0; } -static void destroy_handle_cache(struct zs_pool *pool) +static void destroy_cache(struct zs_pool *pool) { kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->zspage_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } -static void free_handle(struct zs_pool *pool, unsigned long handle) +static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); } +static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) +{ + return kmem_cache_alloc(pool->zspage_cachep, + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +}; + +static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +{ + kmem_cache_free(pool->zspage_cachep, zspage); +} + static void record_obj(unsigned long handle, unsigned long obj) { /* @@ -325,7 +389,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -336,7 +405,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 
0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -404,36 +473,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static bool is_zspage_isolated(struct zspage *zspage) +{ + return zspage->isolated; +} + static int is_first_page(struct page *page) { return PagePrivate(page); } -static int is_last_page(struct page *page) +/* Protected by class->lock */ +static inline int get_zspage_inuse(struct zspage *zspage) +{ + return zspage->inuse; +} + +static inline void set_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse = val; +} + +static inline void mod_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse += val; +} + +static inline struct page *get_first_page(struct zspage *zspage) { - return PagePrivate2(page); + struct page *first_page = zspage->first_page; + + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + return first_page; } -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, +static inline int get_first_obj_offset(struct page *page) +{ + return page->units; +} + +static inline void set_first_obj_offset(struct page *page, int offset) +{ + page->units = offset; +} + +static inline unsigned int get_freeobj(struct zspage *zspage) +{ + return zspage->freeobj; +} + +static inline void set_freeobj(struct zspage *zspage, unsigned int obj) +{ + zspage->freeobj = obj; +} + +static void get_zspage_mapping(struct zspage *zspage, + unsigned int *class_idx, enum fullness_group *fullness) { - unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(zspage->magic != ZSPAGE_MAGIC); - m = (unsigned long)page->mapping; - *fullness = m & FULLNESS_MASK; - *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; + *fullness = zspage->fullness; + *class_idx = zspage->class; } -static void set_zspage_mapping(struct page *page, unsigned int class_idx, +static void set_zspage_mapping(struct zspage *zspage, + unsigned int class_idx, enum fullness_group fullness) { - unsigned long m; - BUG_ON(!is_first_page(page)); - - m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | - (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; + zspage->class = class_idx; + zspage->fullness = fullness; } /* @@ -457,23 +566,19 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] += cnt; + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - if (type < NR_ZS_STAT_TYPE) - class->stats.objs[type] -= cnt; + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - if (type < NR_ZS_STAT_TYPE) - return class->stats.objs[type]; - return 0; + return class->stats.objs[type]; } #ifdef CONFIG_ZSMALLOC_STAT @@ -495,6 +600,8 @@ static void __exit zs_stat_exit(void) debugfs_remove_recursive(zs_stat_root); } +static unsigned long zs_can_compact(struct size_class *class); + static int zs_stats_size_show(struct seq_file *s, void *v) { int i; @@ -502,14 +609,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v) struct size_class *class; int objs_per_zspage; unsigned long class_almost_full, class_almost_empty; - unsigned long obj_allocated, obj_used, pages_used; + unsigned long 
obj_allocated, obj_used, pages_used, freeable; unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", "class", "size", "almost_full", "almost_empty", "obj_allocated", "obj_used", "pages_used", - "pages_per_zspage"); + "pages_per_zspage", "freeable"); for (i = 0; i < zs_size_classes; i++) { class = pool->size_class[i]; @@ -522,6 +630,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); spin_unlock(&class->lock); objs_per_zspage = get_maxobj_per_zspage(class->size, @@ -529,23 +638,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v) pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", i, class->size, class_almost_full, class_almost_empty, obj_allocated, obj_used, pages_used, - class->pages_per_zspage); + class->pages_per_zspage, freeable); total_class_almost_full += class_almost_full; total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; + total_freeable += freeable; } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", "Total", "", total_class_almost_full, total_class_almost_empty, total_objs, - total_used_objs, total_pages); + total_used_objs, total_pages, "", total_freeable); return 0; } @@ -562,7 +673,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(const char *name, struct zs_pool *pool) +static int zs_pool_stat_create(struct zs_pool *pool, const char *name) { struct dentry *entry; @@ -602,7 +713,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name) { return 0; } @@ -620,20 +731,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *page) +static enum fullness_group get_fullness_group(struct size_class *class, + struct zspage *zspage) { - int inuse, max_objects; + int inuse, objs_per_zspage; enum fullness_group fg; - BUG_ON(!is_first_page(page)); - inuse = page->inuse; - max_objects = page->objects; + inuse = get_zspage_inuse(zspage); + objs_per_zspage = class->objs_per_zspage; if (inuse == 0) fg = ZS_EMPTY; - else if (inuse == max_objects) + else if (inuse == objs_per_zspage) fg = ZS_FULL; - else if (inuse <= 3 * max_objects / fullness_threshold_frac) + else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -647,59 +758,41 @@ static enum fullness_group get_fullness_group(struct page *page) * have. This functions inserts the given zspage into the freelist * identified by <class, fullness_group>. 
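
The rewritten get_fullness_group() above derives a zspage's group purely from its inuse count and the per-class object capacity. A minimal userspace sketch of that classification, assuming the upstream fullness_threshold_frac value of 4 (so "almost empty" extends up to three quarters of capacity):

#include <stdio.h>

enum fullness_group { ZS_EMPTY, ZS_ALMOST_EMPTY, ZS_ALMOST_FULL, ZS_FULL };

/* Mirrors get_fullness_group(): only inuse and capacity matter. */
static enum fullness_group fullness(int inuse, int objs_per_zspage)
{
        const int fullness_threshold_frac = 4;  /* assumed, as upstream */

        if (inuse == 0)
                return ZS_EMPTY;
        if (inuse == objs_per_zspage)
                return ZS_FULL;
        if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
                return ZS_ALMOST_EMPTY;
        return ZS_ALMOST_FULL;
}

int main(void)
{
        /* A class holding 8 objects per zspage. */
        for (int inuse = 0; inuse <= 8; inuse++)
                printf("inuse=%d -> group %d\n", inuse, fullness(inuse, 8));
        return 0;
}
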
*/ -static void insert_zspage(struct page *page, struct size_class *class, +static void insert_zspage(struct size_class *class, + struct zspage *zspage, enum fullness_group fullness) { - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); - - head = &class->fullness_list[fullness]; - if (!*head) { - *head = page; - return; - } + struct zspage *head; + zs_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); /* - * We want to see more ZS_FULL pages and less almost - * empty/full. Put pages with higher ->inuse first. + * We want to see more ZS_FULL pages and less almost empty/full. + * Put pages with higher ->inuse first. */ - list_add_tail(&page->lru, &(*head)->lru); - if (page->inuse >= (*head)->inuse) - *head = page; + if (head) { + if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { + list_add(&zspage->list, &head->list); + return; + } + } + list_add(&zspage->list, &class->fullness_list[fullness]); } /* * This function removes the given zspage from the freelist identified * by <class, fullness_group>. */ -static void remove_zspage(struct page *page, struct size_class *class, +static void remove_zspage(struct size_class *class, + struct zspage *zspage, enum fullness_group fullness) { - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - BUG_ON(!*head); - if (list_empty(&(*head)->lru)) - *head = NULL; - else if (*head == page) - *head = (struct page *)list_entry((*head)->lru.next, - struct page, lru); + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); + VM_BUG_ON(is_zspage_isolated(zspage)); - list_del_init(&page->lru); - zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? - CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + list_del_init(&zspage->list); + zs_stat_dec(class, fullness, 1); } /* @@ -712,21 +805,22 @@ static void remove_zspage(struct page *page, struct size_class *class, * fullness group. */ static enum fullness_group fix_fullness_group(struct size_class *class, - struct page *page) + struct zspage *zspage) { int class_idx; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(page)); - - get_zspage_mapping(page, &class_idx, &currfg); - newfg = get_fullness_group(page); + get_zspage_mapping(zspage, &class_idx, &currfg); + newfg = get_fullness_group(class, zspage); if (newfg == currfg) goto out; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); + if (!is_zspage_isolated(zspage)) { + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); + } + + set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; @@ -768,64 +862,49 @@ static int get_pages_per_zspage(int class_size) return max_usedpc_order; } -/* - * A single 'zspage' is composed of many system pages which are - * linked together using fields in struct page. This function finds - * the first/head page, given any component page of a zspage. 
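
The zspages these lists track are still sized by get_pages_per_zspage(), whose body is unchanged by this patch and not shown here. A plausible userspace model of that policy, choosing the page count that wastes the least space when carved into class_size chunks (PAGE_SIZE and the 4-page cap are assumptions for the demo):

#include <stdio.h>

#define PAGE_SIZE               4096
#define ZS_MAX_PAGES_PER_ZSPAGE 4       /* assumed order-2 maximum */

/* Pick the zspage order with the highest used percentage. */
static int pages_per_zspage(int class_size)
{
        int best_pages = 1, best_usedpc = 0;

        for (int i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
                int zspage_size = i * PAGE_SIZE;
                int waste = zspage_size % class_size;
                int usedpc = (zspage_size - waste) * 100 / zspage_size;

                if (usedpc > best_usedpc) {
                        best_usedpc = usedpc;
                        best_pages = i;
                }
        }
        return best_pages;
}

int main(void)
{
        for (int size = 32; size <= 4096; size += 992)
                printf("class %4d -> %d page(s) per zspage\n",
                       size, pages_per_zspage(size));
        return 0;
}
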
- */ -static struct page *get_first_page(struct page *page) +static struct zspage *get_zspage(struct page *page) { - if (is_first_page(page)) - return page; - else - return (struct page *)page_private(page); + struct zspage *zspage = (struct zspage *)page->private; + + BUG_ON(zspage->magic != ZSPAGE_MAGIC); + return zspage; } static struct page *get_next_page(struct page *page) { - struct page *next; + if (unlikely(PageHugeObject(page))) + return NULL; - if (is_last_page(page)) - next = NULL; - else if (is_first_page(page)) - next = (struct page *)page_private(page); - else - next = list_entry(page->lru.next, struct page, lru); + return page->freelist; +} - return next; +/** + * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @page: page object resides in zspage + * @obj_idx: object index + */ +static void obj_to_location(unsigned long obj, struct page **page, + unsigned int *obj_idx) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); } -/* - * Encode <page, obj_idx> as a single handle value. - * We use the least bit of handle for tagging. +/** + * location_to_obj - get obj value encoded from (<page>, <obj_idx>) + * @page: page object resides in zspage + * @obj_idx: object index */ -static void *location_to_obj(struct page *page, unsigned long obj_idx) +static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) { unsigned long obj; - if (!page) { - BUG_ON(obj_idx); - return NULL; - } - obj = page_to_pfn(page) << OBJ_INDEX_BITS; - obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj |= obj_idx & OBJ_INDEX_MASK; obj <<= OBJ_TAG_BITS; - return (void *)obj; -} - -/* - * Decode <page, obj_idx> pair from the given object handle. We adjust the - * decoded obj_idx back to its original value since it was adjusted in - * location_to_obj(). 
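
obj_to_location()/location_to_obj() above pack a PFN and an object index into one word, keeping the low OBJ_TAG_BITS free for tagging. A userspace round trip of that encoding, with illustrative bit widths (the kernel derives OBJ_INDEX_BITS from _PFN_BITS, so the values here are assumptions):

#include <assert.h>
#include <stdio.h>

#define OBJ_TAG_BITS    1
#define OBJ_INDEX_BITS  12
#define OBJ_INDEX_MASK  ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
        unsigned long obj = pfn << OBJ_INDEX_BITS;

        obj |= obj_idx & OBJ_INDEX_MASK;
        obj <<= OBJ_TAG_BITS;           /* low bit(s) left for tags */
        return obj;
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
                            unsigned int *obj_idx)
{
        obj >>= OBJ_TAG_BITS;
        *pfn = obj >> OBJ_INDEX_BITS;
        *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
        unsigned long pfn;
        unsigned int idx;

        obj_to_location(location_to_obj(0x1234, 57), &pfn, &idx);
        assert(pfn == 0x1234 && idx == 57);
        printf("pfn=%#lx idx=%u\n", pfn, idx);
        return 0;
}
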
- */ -static void obj_to_location(unsigned long obj, struct page **page, - unsigned long *obj_idx) -{ - obj >>= OBJ_TAG_BITS; - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); - *obj_idx = (obj & OBJ_INDEX_MASK); + return obj; } static unsigned long handle_to_obj(unsigned long handle) @@ -833,108 +912,146 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct size_class *class, struct page *page, - void *obj) +static unsigned long obj_to_head(struct page *page, void *obj) { - if (class->huge) { - VM_BUG_ON(!is_first_page(page)); - return page_private(page); + if (unlikely(PageHugeObject(page))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); + return page->index; } else return *(unsigned long *)obj; } -static unsigned long obj_idx_to_offset(struct page *page, - unsigned long obj_idx, int class_size) +static inline int testpin_tag(unsigned long handle) { - unsigned long off = 0; - - if (!is_first_page(page)) - off = page->index; - - return off + obj_idx * class_size; + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); } static inline int trypin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void pin_tag(unsigned long handle) { - while (!trypin_tag(handle)); + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void unpin_tag(unsigned long handle) { - unsigned long *ptr = (unsigned long *)handle; - - clear_bit_unlock(HANDLE_PIN_BIT, ptr); + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); } static void reset_page(struct page *page) { + __ClearPageMovable(page); clear_bit(PG_private, &page->flags); clear_bit(PG_private_2, &page->flags); set_page_private(page, 0); - page->mapping = NULL; - page->freelist = NULL; page_mapcount_reset(page); + ClearPageHugeObject(page); + page->freelist = NULL; } -static void free_zspage(struct page *first_page) +/* + * To prevent zspage destroy during migration, zspage freeing should + * hold locks of all pages in the zspage. 
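
pin_tag()/unpin_tag() now lean on the generic bit-spinlock helpers instead of the open-coded test_and_set_bit_lock() loop. A rough userspace analogue of spinning on HANDLE_PIN_BIT with C11 atomics (the helper names here are invented for the demo, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

#define HANDLE_PIN_BIT  0       /* bit 0 of the handle word */

/* Userspace stand-in for bit_spin_trylock()/bit_spin_unlock(). */
static int try_pin(atomic_ulong *word)
{
        unsigned long old = atomic_fetch_or(word, 1UL << HANDLE_PIN_BIT);

        return !(old & (1UL << HANDLE_PIN_BIT));
}

static void pin(atomic_ulong *word)
{
        while (!try_pin(word))
                ;       /* spin; the kernel relaxes the cpu here */
}

static void unpin(atomic_ulong *word)
{
        atomic_fetch_and(word, ~(1UL << HANDLE_PIN_BIT));
}

int main(void)
{
        atomic_ulong handle = 0;

        pin(&handle);
        printf("second try_pin: %d (expected 0)\n", try_pin(&handle));
        unpin(&handle);
        printf("after unpin:    %d (expected 1)\n", try_pin(&handle));
        return 0;
}
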
+ */ +void lock_zspage(struct zspage *zspage) { - struct page *nextp, *tmp, *head_extra; + struct page *page = get_first_page(zspage); - BUG_ON(!is_first_page(first_page)); - BUG_ON(first_page->inuse); + do { + lock_page(page); + } while ((page = get_next_page(page)) != NULL); +} - head_extra = (struct page *)page_private(first_page); +int trylock_zspage(struct zspage *zspage) +{ + struct page *cursor, *fail; - reset_page(first_page); - __free_page(first_page); + for (cursor = get_first_page(zspage); cursor != NULL; cursor = + get_next_page(cursor)) { + if (!trylock_page(cursor)) { + fail = cursor; + goto unlock; + } + } - /* zspage with only 1 system page */ - if (!head_extra) - return; + return 1; +unlock: + for (cursor = get_first_page(zspage); cursor != fail; cursor = + get_next_page(cursor)) + unlock_page(cursor); + + return 0; +} + +static void __free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + struct page *page, *next; + enum fullness_group fg; + unsigned int class_idx; + + get_zspage_mapping(zspage, &class_idx, &fg); + + assert_spin_locked(&class->lock); + + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(fg != ZS_EMPTY); + + next = page = get_first_page(zspage); + do { + VM_BUG_ON_PAGE(!PageLocked(page), page); + next = get_next_page(page); + reset_page(page); + unlock_page(page); + put_page(page); + page = next; + } while (page != NULL); - list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { - list_del(&nextp->lru); - reset_page(nextp); - __free_page(nextp); + cache_free_zspage(pool, zspage); + + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); +} + +static void free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; } - reset_page(head_extra); - __free_page(head_extra); + + remove_zspage(class, zspage, ZS_EMPTY); + __free_zspage(pool, class, zspage); } /* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) +static void init_zspage(struct size_class *class, struct zspage *zspage) { + unsigned int freeobj = 1; unsigned long off = 0; - struct page *page = first_page; + struct page *page = get_first_page(zspage); - BUG_ON(!is_first_page(first_page)); while (page) { struct page *next_page; struct link_free *link; - unsigned int i = 1; void *vaddr; - /* - * page->index stores offset of first object starting - * in the page. For the first page, this is always 0, - * so we use first_page->index (aka ->freelist) to store - * head of corresponding zspage's freelist. - */ - if (page != first_page) - page->index = off; + set_first_obj_offset(page, off); vaddr = kmap_atomic(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = location_to_obj(page, i++); + link->next = freeobj++ << OBJ_ALLOCATED_TAG; link += class->size / sizeof(*link); } @@ -944,87 +1061,108 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = location_to_obj(next_page, 0); + if (next_page) { + link->next = freeobj++ << OBJ_ALLOCATED_TAG; + } else { + /* + * Reset OBJ_ALLOCATED_TAG bit to last link to tell + * whether it's allocated object or not. 
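
trylock_zspage() above is all-or-nothing: if any subpage is already locked, every page locked so far is released before returning failure. The same rollback pattern sketched with pthread mutexes (NR_PAGES and the lock array are stand-ins for the subpage page locks):

#include <pthread.h>
#include <stdio.h>

#define NR_PAGES 4

static pthread_mutex_t page_lock[NR_PAGES];

/* Lock every page or none: mirrors trylock_zspage()'s rollback. */
static int trylock_all(void)
{
        int i;

        for (i = 0; i < NR_PAGES; i++) {
                if (pthread_mutex_trylock(&page_lock[i])) {
                        while (--i >= 0)
                                pthread_mutex_unlock(&page_lock[i]);
                        return 0;
                }
        }
        return 1;
}

int main(void)
{
        for (int i = 0; i < NR_PAGES; i++)
                pthread_mutex_init(&page_lock[i], NULL);

        pthread_mutex_lock(&page_lock[2]);      /* simulate a busy subpage */
        printf("trylock_all: %d (expected 0)\n", trylock_all());
        pthread_mutex_unlock(&page_lock[2]);
        printf("trylock_all: %d (expected 1)\n", trylock_all());
        return 0;
}
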
+ */ + link->next = -1 << OBJ_ALLOCATED_TAG; + } kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } + + set_freeobj(zspage, 0); } -/* - * Allocate a zspage for the given size class - */ -static struct page *alloc_zspage(struct size_class *class, gfp_t flags) +static void create_page_chain(struct size_class *class, struct zspage *zspage, + struct page *pages[]) { - int i, error; - struct page *first_page = NULL, *uninitialized_var(prev_page); + int i; + struct page *page; + struct page *prev_page = NULL; + int nr_pages = class->pages_per_zspage; /* * Allocate individual pages and link them together as: - * 1. first page->private = first sub-page - * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->private + * 1. all pages are linked together using page->freelist + * 2. each sub-page point to zspage using page->private * - * For each size class, First/Head pages are linked together using - * page->lru. Also, we set PG_private to identify the first page - * (i.e. no other sub-page has this flag set) and PG_private_2 to - * identify the last page. + * we set PG_private to identify the first page (i.e. no other sub-page + * has this flag set) and PG_private_2 to identify the last page. */ - error = -ENOMEM; - for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; - - page = alloc_page(flags); - if (!page) - goto cleanup; - - INIT_LIST_HEAD(&page->lru); - if (i == 0) { /* first page */ + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + set_page_private(page, (unsigned long)zspage); + page->freelist = NULL; + if (i == 0) { + zspage->first_page = page; SetPagePrivate(page); - set_page_private(page, 0); - first_page = page; - first_page->inuse = 0; + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) + SetPageHugeObject(page); + } else { + prev_page->freelist = page; } - if (i == 1) - set_page_private(first_page, (unsigned long)page); - if (i >= 1) - set_page_private(page, (unsigned long)first_page); - if (i >= 2) - list_add(&page->lru, &prev_page->lru); - if (i == class->pages_per_zspage - 1) /* last page */ + if (i == nr_pages - 1) SetPagePrivate2(page); prev_page = page; } +} - init_zspage(first_page, class); +/* + * Allocate a zspage for the given size class + */ +static struct zspage *alloc_zspage(struct zs_pool *pool, + struct size_class *class, + gfp_t gfp) +{ + int i; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zspage *zspage = cache_alloc_zspage(pool, gfp); - first_page->freelist = location_to_obj(first_page, 0); - /* Maximum number of objects we can store in this zspage */ - first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; + if (!zspage) + return NULL; - error = 0; /* Success */ + memset(zspage, 0, sizeof(struct zspage)); + zspage->magic = ZSPAGE_MAGIC; + migrate_lock_init(zspage); -cleanup: - if (unlikely(error) && first_page) { - free_zspage(first_page); - first_page = NULL; + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; + + page = alloc_page(gfp); + if (!page) { + while (--i >= 0) + __free_page(pages[i]); + cache_free_zspage(pool, zspage); + return NULL; + } + pages[i] = page; } - return first_page; + create_page_chain(class, zspage, pages); + init_zspage(class, zspage); + + return zspage; } -static struct page *find_get_zspage(struct size_class *class) +static struct zspage *find_get_zspage(struct size_class *class) { int i; - struct page *page; + struct zspage *zspage; - for (i = 0; i < 
_ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) + for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + zspage = list_first_entry_or_null(&class->fullness_list[i], + struct zspage, list); + if (zspage) break; } - return page; + return zspage; } #ifdef CONFIG_PGTABLE_MAPPING @@ -1127,11 +1265,9 @@ static void __zs_unmap_object(struct mapping_area *area, goto out; buf = area->vm_buf; - if (!area->huge) { - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - } + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -1231,11 +1367,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } -static bool zspage_full(struct page *page) +static bool zspage_full(struct size_class *class, struct zspage *zspage) { - BUG_ON(!is_first_page(page)); - - return page->inuse == page->objects; + return get_zspage_inuse(zspage) == class->objs_per_zspage; } unsigned long zs_get_total_pages(struct zs_pool *pool) @@ -1261,8 +1395,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; @@ -1271,23 +1407,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, struct page *pages[2]; void *ret; - BUG_ON(!handle); - /* * Because we use per-cpu mapping areas shared among the * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - BUG_ON(in_interrupt()); + WARN_ON_ONCE(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + + /* migration cannot move any subpage in this zspage */ + migrate_read_lock(zspage); + + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = &get_cpu_var(zs_map_area); area->vm_mm = mm; @@ -1305,7 +1444,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (!class->huge) + if (likely(!PageHugeObject(page))) ret += ZS_HANDLE_SIZE; return ret; @@ -1314,21 +1453,22 @@ EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { + struct zspage *zspage; struct page *page; - unsigned long obj, obj_idx, off; + unsigned long obj, off; + unsigned int obj_idx; unsigned int class_idx; enum fullness_group fg; struct size_class *class; struct mapping_area *area; - BUG_ON(!handle); - obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); class = pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); + off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1343,38 +1483,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + + 
migrate_read_unlock(zspage); unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); -static unsigned long obj_malloc(struct page *first_page, - struct size_class *class, unsigned long handle) +static unsigned long obj_malloc(struct size_class *class, + struct zspage *zspage, unsigned long handle) { + int i, nr_page, offset; unsigned long obj; struct link_free *link; struct page *m_page; - unsigned long m_objidx, m_offset; + unsigned long m_offset; void *vaddr; handle |= OBJ_ALLOCATED_TAG; - obj = (unsigned long)first_page->freelist; - obj_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + obj = get_freeobj(zspage); + + offset = obj * class->size; + nr_page = offset >> PAGE_SHIFT; + m_offset = offset & ~PAGE_MASK; + m_page = get_first_page(zspage); + + for (i = 0; i < nr_page; i++) + m_page = get_next_page(m_page); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); - first_page->freelist = link->next; - if (!class->huge) + set_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG); + if (likely(!PageHugeObject(m_page))) /* record handle in the header of allocated chunk */ link->handle = handle; else - /* record handle in first_page->private */ - set_page_private(first_page, handle); + /* record handle to page->index */ + zspage->first_page->index = handle; + kunmap_atomic(vaddr); - first_page->inuse++; + mod_zspage_inuse(zspage, 1); zs_stat_inc(class, OBJ_USED, 1); + obj = location_to_obj(m_page, obj); + return obj; } @@ -1388,16 +1540,17 @@ static unsigned long obj_malloc(struct page *first_page, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; - struct page *first_page; + enum fullness_group newfg; + struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = cache_alloc_handle(pool, gfp); if (!handle) return 0; @@ -1406,71 +1559,79 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { + zspage = find_get_zspage(class); + if (likely(zspage)) { + obj = obj_malloc(class, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); - if (unlikely(!first_page)) { - free_handle(pool, handle); - return 0; - } - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - atomic_long_add(class->pages_per_zspage, - &pool->pages_allocated); + return handle; + } - spin_lock(&class->lock); - zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); + spin_unlock(&class->lock); + + zspage = alloc_zspage(pool, class, gfp); + if (!zspage) { + cache_free_handle(pool, handle); + return 0; } - obj = obj_malloc(first_page, class, handle); - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(class, first_page); + spin_lock(&class->lock); + obj = obj_malloc(class, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); + 
atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); spin_unlock(&class->lock); return handle; } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct zs_pool *pool, struct size_class *class, - unsigned long obj) +static void obj_free(struct size_class *class, unsigned long obj) { struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; + struct zspage *zspage; + struct page *f_page; + unsigned long f_offset; + unsigned int f_objidx; void *vaddr; - BUG_ON(!obj); - obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + f_offset = (class->size * f_objidx) & ~PAGE_MASK; + zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = first_page->freelist; - if (class->huge) - set_page_private(first_page, 0); + link->next = get_freeobj(zspage) << OBJ_ALLOCATED_TAG; kunmap_atomic(vaddr); - first_page->freelist = (void *)obj; - first_page->inuse--; + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) { - struct page *first_page, *f_page; - unsigned long obj, f_objidx; + struct zspage *zspage; + struct page *f_page; + unsigned long obj; + unsigned int f_objidx; int class_idx; struct size_class *class; enum fullness_group fullness; + bool isolated; if (unlikely(!handle)) return; @@ -1478,33 +1639,39 @@ void zs_free(struct zs_pool *pool, unsigned long handle) pin_tag(handle); obj = handle_to_obj(handle); obj_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); + zspage = get_zspage(f_page); + + migrate_read_lock(zspage); - get_zspage_mapping(first_page, &class_idx, &fullness); + get_zspage_mapping(zspage, &class_idx, &fullness); class = pool->size_class[class_idx]; spin_lock(&class->lock); - obj_free(pool, class, obj); - fullness = fix_fullness_group(class, first_page); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); - free_zspage(first_page); + obj_free(class, obj); + fullness = fix_fullness_group(class, zspage); + if (fullness != ZS_EMPTY) { + migrate_read_unlock(zspage); + goto out; } + + isolated = is_zspage_isolated(zspage); + migrate_read_unlock(zspage); + /* If zspage is isolated, zs_page_putback will free the zspage */ + if (likely(!isolated)) + free_zspage(pool, class, zspage); +out: + spin_unlock(&class->lock); unpin_tag(handle); - - free_handle(pool, handle); + cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); -static void zs_object_copy(unsigned long dst, unsigned long src, - struct size_class *class) +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) { struct page *s_page, *d_page; - unsigned long s_objidx, d_objidx; + unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; int s_size, d_size, size; @@ -1515,8 +1682,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, 
&d_page, &d_objidx); - s_off = obj_idx_to_offset(s_page, s_objidx, class->size); - d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + s_off = (class->size * s_objidx) & ~PAGE_MASK; + d_off = (class->size * d_objidx) & ~PAGE_MASK; if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; @@ -1544,7 +1711,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); - BUG_ON(!s_page); s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); s_size = class->size - written; @@ -1554,7 +1720,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); - BUG_ON(!d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; @@ -1569,20 +1734,19 @@ static void zs_object_copy(unsigned long dst, unsigned long src, * Find alloced object in zspage from index object and * return handle. */ -static unsigned long find_alloced_obj(struct page *page, int index, - struct size_class *class) +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int index) { unsigned long head; int offset = 0; unsigned long handle = 0; void *addr = kmap_atomic(page); - if (!is_first_page(page)) - offset = page->index; + offset = get_first_obj_offset(page); offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(class, page, addr + offset); + head = obj_to_head(page, addr + offset); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; if (trypin_tag(handle)) @@ -1599,7 +1763,7 @@ static unsigned long find_alloced_obj(struct page *page, int index, } struct zs_compact_control { - /* Source page for migration which could be a subpage of zspage. */ + /* Source spage for migration which could be a subpage of zspage */ struct page *s_page; /* Destination page for migration which should be a first page * of zspage. 
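
zs_object_copy() above advances source and destination cursors independently because an object may straddle a subpage boundary on either side. A self-contained model of that chunked copy, using deliberately tiny pages so the straddling is visible:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 64    /* tiny "pages" for the demo */

/*
 * Copy `size` bytes starting at `off` inside a chain of fixed-size
 * pages, never crossing a page boundary in a single memcpy.
 */
static void object_copy(char dst[][PAGE_SIZE], int d_off,
                        char src[][PAGE_SIZE], int s_off, int size)
{
        int s_page = 0, d_page = 0, written = 0;

        while (written < size) {
                int s_room = PAGE_SIZE - s_off;
                int d_room = PAGE_SIZE - d_off;
                int chunk = size - written;

                if (chunk > s_room)
                        chunk = s_room;
                if (chunk > d_room)
                        chunk = d_room;

                memcpy(&dst[d_page][d_off], &src[s_page][s_off], chunk);
                written += chunk;
                s_off += chunk;
                d_off += chunk;
                if (s_off == PAGE_SIZE) {       /* next source subpage */
                        s_page++;
                        s_off = 0;
                }
                if (d_off == PAGE_SIZE) {       /* next destination subpage */
                        d_page++;
                        d_off = 0;
                }
        }
}

int main(void)
{
        char src[2][PAGE_SIZE], dst[2][PAGE_SIZE];

        memset(src, 'x', sizeof(src));
        object_copy(dst, 50, src, 40, 40);      /* both sides straddle */
        printf("%.1s%.1s\n", &dst[0][50], &dst[1][25]);  /* prints "xx" */
        return 0;
}
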
*/ @@ -1620,7 +1784,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, int ret = 0; while (1) { - handle = find_alloced_obj(s_page, index, class); + handle = find_alloced_obj(class, s_page, index); if (!handle) { s_page = get_next_page(s_page); if (!s_page) @@ -1630,15 +1794,15 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } /* Stop if there is no more space */ - if (zspage_full(d_page)) { + if (zspage_full(class, get_zspage(d_page))) { unpin_tag(handle); ret = -ENOMEM; break; } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(d_page, class, handle); - zs_object_copy(free_obj, used_obj, class); + free_obj = obj_malloc(class, get_zspage(d_page), handle); + zs_object_copy(class, free_obj, used_obj); index++; /* * record_obj updates handle's value to free_obj and it will @@ -1649,7 +1813,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); - obj_free(pool, class, used_obj); + obj_free(class, used_obj); } /* Remember last position in this iteration */ @@ -1659,71 +1823,423 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, return ret; } -static struct page *isolate_target_page(struct size_class *class) +static struct zspage *isolate_zspage(struct size_class *class, bool source) { int i; - struct page *page; + struct zspage *zspage; + enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) { - remove_zspage(page, class, i); - break; + if (!source) { + fg[0] = ZS_ALMOST_FULL; + fg[1] = ZS_ALMOST_EMPTY; + } + + for (i = 0; i < 2; i++) { + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { + VM_BUG_ON(is_zspage_isolated(zspage)); + remove_zspage(class, zspage, fg[i]); + return zspage; } } - return page; + return zspage; } /* - * putback_zspage - add @first_page into right class's fullness list - * @pool: target pool + * putback_zspage - add @zspage into right class's fullness list * @class: destination class - * @first_page: target page + * @zspage: target page * - * Return @fist_page's fullness_group + * Return @zspage's fullness_group */ -static enum fullness_group putback_zspage(struct zs_pool *pool, - struct size_class *class, - struct page *first_page) +static enum fullness_group putback_zspage(struct size_class *class, + struct zspage *zspage) { enum fullness_group fullness; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON(is_zspage_isolated(zspage)); - fullness = get_fullness_group(first_page); - insert_zspage(first_page, class, fullness); - set_zspage_mapping(first_page, class->index, fullness); + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); - if (fullness == ZS_EMPTY) { - zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( - class->size, class->pages_per_zspage)); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); + return fullness; +} + +#ifdef CONFIG_COMPACTION +static struct dentry *zs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); +} + +static struct file_system_type zsmalloc_fs = { + .name = "zsmalloc", + .mount = 
zs_mount, + .kill_sb = kill_anon_super, +}; + +static int zsmalloc_mount(void) +{ + int ret = 0; + + zsmalloc_mnt = kern_mount(&zsmalloc_fs); + if (IS_ERR(zsmalloc_mnt)) + ret = PTR_ERR(zsmalloc_mnt); + + return ret; +} + +static void zsmalloc_unmount(void) +{ + kern_unmount(zsmalloc_mnt); +} + +static void migrate_lock_init(struct zspage *zspage) +{ + rwlock_init(&zspage->lock); +} + +static void migrate_read_lock(struct zspage *zspage) +{ + read_lock(&zspage->lock); +} - free_zspage(first_page); +static void migrate_read_unlock(struct zspage *zspage) +{ + read_unlock(&zspage->lock); +} + +static void migrate_write_lock(struct zspage *zspage) +{ + write_lock(&zspage->lock); +} + +static void migrate_write_unlock(struct zspage *zspage) +{ + write_unlock(&zspage->lock); +} + +/* Number of isolated subpage for *page migration* in this zspage */ +static void inc_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated++; +} + +static void dec_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated--; +} + +static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) +{ + struct page *page; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + int idx = 0; + + page = get_first_page(zspage); + do { + if (page == oldpage) + pages[idx] = newpage; + else + pages[idx] = page; + idx++; + } while ((page = get_next_page(page)) != NULL); + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + if (unlikely(PageHugeObject(oldpage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, page_mapping(oldpage)); +} + +bool zs_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct address_space *mapping; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); + + /* + * Without class lock, fullness could be stale while class_idx is okay + * because class_idx is constant unless page is freed so we should get + * fullness again under class lock. + */ + get_zspage_mapping(zspage, &class_idx, &fullness); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + if (get_zspage_inuse(zspage) == 0) { + spin_unlock(&class->lock); + return false; } - return fullness; + /* zspage is isolated for object migration */ + if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + spin_unlock(&class->lock); + return false; + } + + /* + * If this is first time isolation for the zspage, isolate zspage from + * size_class to prevent further object allocation from the zspage. 
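
The zspage rwlock introduced for compaction lets many mappers hold the lock for read (zs_map_object()) while page migration takes it for write to exclude them all. A pthread sketch of that reader/writer split (the thread bodies are illustrative, not kernel code):

#include <pthread.h>
#include <stdio.h>

/* Per-zspage lock: readers map objects, the writer migrates pages. */
static pthread_rwlock_t zspage_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *mapper(void *arg)
{
        pthread_rwlock_rdlock(&zspage_lock);    /* migrate_read_lock() */
        /* safe to touch objects: no subpage can move under us */
        pthread_rwlock_unlock(&zspage_lock);    /* migrate_read_unlock() */
        return NULL;
}

static void migrate(void)
{
        pthread_rwlock_wrlock(&zspage_lock);    /* migrate_write_lock() */
        /* replace the subpage, re-encode pinned handles */
        pthread_rwlock_unlock(&zspage_lock);
}

int main(void)
{
        pthread_t t[4];

        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, mapper, NULL);
        migrate();
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        puts("done");
        return 0;
}
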
+ */ + if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + get_zspage_mapping(zspage, &class_idx, &fullness); + remove_zspage(class, zspage, fullness); + } + + inc_zspage_isolation(zspage); + spin_unlock(&class->lock); + + return true; } -static struct page *isolate_source_page(struct size_class *class) +int zs_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fullness; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; + int offset, pos; + unsigned long handle, head; + unsigned long old_obj, new_obj; + unsigned int obj_idx; + int ret = -EAGAIN; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + + /* Concurrent compactor cannot migrate any subpage in zspage */ + migrate_write_lock(zspage); + get_zspage_mapping(zspage, &class_idx, &fullness); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + offset = get_first_obj_offset(page); + + spin_lock(&class->lock); + if (!get_zspage_inuse(zspage)) { + ret = -EBUSY; + goto unlock_class; + } + + pos = offset; + s_addr = kmap_atomic(page); + while (pos < PAGE_SIZE) { + head = obj_to_head(page, s_addr + pos); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!trypin_tag(handle)) + goto unpin_objects; + } + pos += class->size; + } + + /* + * Here, any user cannot access all objects in the zspage so let's move. + */ + d_addr = kmap_atomic(newpage); + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); + new_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, new_obj); + } + } + + replace_sub_page(class, zspage, newpage, page); + get_page(newpage); + + dec_zspage_isolation(zspage); + + /* + * Page migration is done so let's putback isolated zspage to + * the list if @page is final isolated subpage in the zspage. 
+ */ + if (!is_zspage_isolated(zspage)) + putback_zspage(class, zspage); + + reset_page(page); + put_page(page); + page = newpage; + + ret = MIGRATEPAGE_SUCCESS; +unpin_objects: + for (addr = s_addr + offset; addr < s_addr + pos; + addr += class->size) { + head = obj_to_head(page, addr); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (!testpin_tag(handle)) + BUG(); + unpin_tag(handle); + } + } + kunmap_atomic(s_addr); +unlock_class: + spin_unlock(&class->lock); + migrate_write_unlock(zspage); + + return ret; +} + +void zs_page_putback(struct page *page) +{ + struct zs_pool *pool; + struct size_class *class; + int class_idx; + enum fullness_group fg; + struct address_space *mapping; + struct zspage *zspage; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + get_zspage_mapping(zspage, &class_idx, &fg); + mapping = page_mapping(page); + pool = mapping->private_data; + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + dec_zspage_isolation(zspage); + if (!is_zspage_isolated(zspage)) { + fg = putback_zspage(class, zspage); + /* + * Due to page_lock, we cannot free zspage immediately + * so let's defer. + */ + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + } + spin_unlock(&class->lock); +} + +const struct address_space_operations zsmalloc_aops = { + .isolate_page = zs_page_isolate, + .migratepage = zs_page_migrate, + .putback_page = zs_page_putback, +}; + +static int zs_register_migration(struct zs_pool *pool) +{ + pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &zsmalloc_aops; + return 0; +} + +static void zs_unregister_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); + if (pool->inode) + iput(pool->inode); +} + +/* + * Caller should hold page_lock of all pages in the zspage + * In here, we cannot use zspage meta data. 
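
When zs_free() or zs_page_putback() empties a zspage whose pages it cannot lock, the free is deferred to pool->free_work rather than done inline. A userspace analogue of that kick-and-defer handoff (the condition-variable plumbing stands in for the kernel workqueue; nothing here is the kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kicked = PTHREAD_COND_INITIALIZER;
static int pending;

static void kick_deferred_free(void)
{
        pthread_mutex_lock(&lock);
        pending = 1;            /* ~ schedule_work(&pool->free_work) */
        pthread_cond_signal(&kicked);
        pthread_mutex_unlock(&lock);
}

static void *free_worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!pending)
                pthread_cond_wait(&kicked, &lock);
        pthread_mutex_unlock(&lock);
        puts("worker: freeing empty zspages");  /* ~ async_free_zspage() */
        return NULL;
}

int main(void)
{
        pthread_t worker;

        pthread_create(&worker, NULL, free_worker, NULL);
        kick_deferred_free();
        pthread_join(worker, NULL);
        return 0;
}
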
+ */ +static void async_free_zspage(struct work_struct *work) { int i; - struct page *page = NULL; + struct size_class *class; + unsigned int class_idx; + enum fullness_group fullness; + struct zspage *zspage, *tmp; + LIST_HEAD(free_pages); + struct zs_pool *pool = container_of(work, struct zs_pool, + free_work); - for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { - page = class->fullness_list[i]; - if (!page) + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + if (class->index != i) continue; - remove_zspage(page, class, i); - break; + spin_lock(&class->lock); + list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); + spin_unlock(&class->lock); } - return page; + + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); + + get_zspage_mapping(zspage, &class_idx, &fullness); + VM_BUG_ON(fullness != ZS_EMPTY); + class = pool->size_class[class_idx]; + spin_lock(&class->lock); + __free_zspage(pool, pool->size_class[class_idx], zspage); + spin_unlock(&class->lock); + } +}; + +static void kick_deferred_free(struct zs_pool *pool) +{ + schedule_work(&pool->free_work); +} + +static void init_deferred_free(struct zs_pool *pool) +{ + INIT_WORK(&pool->free_work, async_free_zspage); } +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, pool->inode->i_mapping); + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); +} +#endif + /* * * Based on the number of unused allocated objects calculate @@ -1748,22 +2264,20 @@ static unsigned long zs_can_compact(struct size_class *class) static void __zs_compact(struct zs_pool *pool, struct size_class *class) { struct zs_compact_control cc; - struct page *src_page; - struct page *dst_page = NULL; + struct zspage *src_zspage; + struct zspage *dst_zspage = NULL; spin_lock(&class->lock); - while ((src_page = isolate_source_page(class))) { - - BUG_ON(!is_first_page(src_page)); + while ((src_zspage = isolate_zspage(class, true))) { if (!zs_can_compact(class)) break; cc.index = 0; - cc.s_page = src_page; + cc.s_page = get_first_page(src_zspage); - while ((dst_page = isolate_target_page(class))) { - cc.d_page = dst_page; + while ((dst_zspage = isolate_zspage(class, false))) { + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched * and see if anyone had allocated another zspage. @@ -1771,23 +2285,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) if (!migrate_zspage(pool, class, &cc)) break; - putback_zspage(pool, class, dst_page); + putback_zspage(class, dst_zspage); } /* Stop if we couldn't find slot */ - if (dst_page == NULL) + if (dst_zspage == NULL) break; - putback_zspage(pool, class, dst_page); - if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + putback_zspage(class, dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + free_zspage(pool, class, src_zspage); pool->stats.pages_compacted += class->pages_per_zspage; + } spin_unlock(&class->lock); cond_resched(); spin_lock(&class->lock); } - if (src_page) - putback_zspage(pool, class, src_page); + if (src_zspage) + putback_zspage(class, src_zspage); spin_unlock(&class->lock); } @@ -1884,7 +2400,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. 
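
zs_can_compact(), surfaced through the new "freeable" stats column, estimates how many pages compaction could release from the gap between allocated and used objects. Its body is not part of this hunk, so the arithmetic below is an assumption modelled on the stats it reads:

#include <stdio.h>

/* Unused-but-allocated objects, expressed in whole zspages,
 * give the number of pages compaction could free. */
static unsigned long can_compact(unsigned long obj_allocated,
                                 unsigned long obj_used,
                                 int objs_per_zspage, int pages_per_zspage)
{
        unsigned long obj_wasted = obj_allocated - obj_used;

        return obj_wasted / objs_per_zspage * pages_per_zspage;
}

int main(void)
{
        /* 100 slots allocated, 58 used, 8 objects per 2-page zspage. */
        printf("freeable pages: %lu\n", can_compact(100, 58, 8, 2));
        return 0;
}
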
*/ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1894,6 +2410,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) if (!pool) return NULL; + init_deferred_free(pool); pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { @@ -1905,7 +2422,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) if (!pool->name) goto err; - if (create_handle_cache(pool)) + if (create_cache(pool)) goto err; /* @@ -1916,6 +2433,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) int size; int pages_per_zspage; struct size_class *class; + int fullness = 0; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) @@ -1945,18 +2463,21 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; - if (pages_per_zspage == 1 && - get_maxobj_per_zspage(size, pages_per_zspage) == 1) - class->huge = true; + class->objs_per_zspage = class->pages_per_zspage * + PAGE_SIZE / class->size; spin_lock_init(&class->lock); pool->size_class[i] = class; + for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; + fullness++) + INIT_LIST_HEAD(&class->fullness_list[fullness]); prev_class = class; } - pool->flags = flags; + if (zs_pool_stat_create(pool, name)) + goto err; - if (zs_pool_stat_create(name, pool)) + if (zs_register_migration(pool)) goto err; /* @@ -1978,6 +2499,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); + zs_unregister_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < zs_size_classes; i++) { @@ -1990,8 +2512,8 @@ void zs_destroy_pool(struct zs_pool *pool) if (class->index != i) continue; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { + for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { + if (!list_empty(&class->fullness_list[fg])) { pr_info("Freeing non-empty class with size %db, fullness group %d\n", class->size, fg); } @@ -1999,7 +2521,7 @@ void zs_destroy_pool(struct zs_pool *pool) kfree(class); } - destroy_handle_cache(pool); + destroy_cache(pool); kfree(pool->size_class); kfree(pool->name); kfree(pool); @@ -2008,7 +2530,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { - int ret = zs_register_cpu_notifier(); + int ret; + + ret = zsmalloc_mount(); + if (ret) + goto out; + + ret = zs_register_cpu_notifier(); if (ret) goto notifier_fail; @@ -2032,7 +2560,8 @@ stat_fail: #endif notifier_fail: zs_unregister_cpu_notifier(); - + zsmalloc_unmount(); +out: return ret; } @@ -2041,6 +2570,7 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif + zsmalloc_unmount(); zs_unregister_cpu_notifier(); zs_stat_exit(); |
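
Taken together, the user-visible API change is that gfp policy moves from pool creation to each allocation. A minimal kernel-style sketch of the resulting calling convention (the caller and its error handling are hypothetical; ZS_MM_WO is assumed to be the pre-existing write-only map mode from zsmalloc.h):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>
#include <linux/errno.h>
#include <linux/string.h>

static int zs_example(void)
{
        struct zs_pool *pool;
        unsigned long handle;
        void *mem;

        pool = zs_create_pool("example");       /* gfp argument is gone */
        if (!pool)
                return -ENOMEM;

        /* gfp now travels with each allocation request */
        handle = zs_malloc(pool, 128, GFP_KERNEL);
        if (!handle) {
                zs_destroy_pool(pool);
                return -ENOMEM;
        }

        mem = zs_map_object(pool, handle, ZS_MM_WO);
        memset(mem, 0, 128);
        zs_unmap_object(pool, handle);

        zs_free(pool, handle);
        zs_destroy_pool(pool);
        return 0;
}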