Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               84
-rw-r--r--  mm/Kconfig.debug         77
-rw-r--r--  mm/Makefile              10
-rw-r--r--  mm/backing-dev.c         34
-rw-r--r--  mm/balloon_compaction.c  94
-rw-r--r--  mm/bootmem.c              2
-rw-r--r--  mm/cma.c                 33
-rw-r--r--  mm/compaction.c         402
-rw-r--r--  mm/debug-pagealloc.c    143
-rw-r--r--  mm/debug.c               16
-rw-r--r--  mm/filemap.c             68
-rw-r--r--  mm/internal.h            11
-rw-r--r--  mm/kasan/Makefile         1
-rw-r--r--  mm/kmemleak.c            12
-rw-r--r--  mm/ksm.c                 91
-rw-r--r--  mm/maccess.c              3
-rw-r--r--  mm/memblock.c            74
-rw-r--r--  mm/memory-failure.c       4
-rw-r--r--  mm/memory.c               5
-rw-r--r--  mm/memory_hotplug.c      58
-rw-r--r--  mm/memtest.c              4
-rw-r--r--  mm/migrate.c            278
-rw-r--r--  mm/mmap.c                16
-rw-r--r--  mm/nobootmem.c            2
-rw-r--r--  mm/nommu.c                7
-rw-r--r--  mm/oom_kill.c             2
-rw-r--r--  mm/page-writeback.c      44
-rw-r--r--  mm/page_alloc.c         281
-rw-r--r--  mm/page_ext.c            11
-rw-r--r--  mm/page_isolation.c       8
-rw-r--r--  mm/page_owner.c         263
-rw-r--r--  mm/page_poison.c        134
-rw-r--r--  mm/process_reclaim.c    256
-rw-r--r--  mm/readahead.c           53
-rw-r--r--  mm/rmap.c                20
-rw-r--r--  mm/shmem.c                4
-rw-r--r--  mm/showmem.c             54
-rw-r--r--  mm/slub.c                16
-rw-r--r--  mm/swap.c                20
-rw-r--r--  mm/swap_ratio.c         196
-rw-r--r--  mm/swap_state.c           8
-rw-r--r--  mm/swapfile.c            84
-rw-r--r--  mm/truncate.c           371
-rw-r--r--  mm/userfaultfd.c         10
-rw-r--r--  mm/util.c                 5
-rw-r--r--  mm/vmalloc.c            295
-rw-r--r--  mm/vmpressure.c         176
-rw-r--r--  mm/vmscan.c             418
-rw-r--r--  mm/vmstat.c              53
-rw-r--r--  mm/zbud.c                33
-rw-r--r--  mm/zcache.c            1169
-rw-r--r--  mm/zsmalloc.c          1480
52 files changed, 5481 insertions, 1512 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5753f69b23f4..274a315e0684 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,7 +187,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
+ depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390 || ARM64)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -601,6 +601,16 @@ config ZSMALLOC_STAT
information to userspace via debugfs.
If unsure, say N.
+config VMAP_LAZY_PURGING_FACTOR
+ int "multiplier to the size of purged vmap areas"
+ default "8" if ARM
+ default "32"
+ help
+	  Used as a multiplier to the maximum number of VA pages purged
+	  in a single attempt. On 32-bit, the default is lowered to "8"
+	  to reduce fragmentation of vmalloc space.
+
+
config GENERIC_EARLY_IOREMAP
bool
@@ -619,6 +629,44 @@ config MAX_STACK_SIZE_MB
A sane initial value is 80 MB.
+config ZCACHE
+ bool "Compressed cache for file pages (EXPERIMENTAL)"
+ depends on CRYPTO && CLEANCACHE
+ select CRYPTO_LZO
+ select ZBUD
+ default n
+ help
+ A compressed cache for file pages.
+ It takes active file pages that are in the process of being reclaimed
+ and attempts to compress them into a dynamically allocated RAM-based
+ memory pool.
+
+	  If this process is successful, then when those file pages are needed
+	  again, the read I/O is avoided. This results in significant performance
+	  gains under memory pressure for systems full of file pages.
+
+config BALANCE_ANON_FILE_RECLAIM
+ bool "During reclaim treat anon and file backed pages equally"
+ depends on SWAP
+ help
+ When performing memory reclaim treat anonymous and file backed pages
+ equally.
+ Swapping anonymous pages out to memory can be efficient enough to justify
+ treating anonymous and file backed pages equally.
+
+config KSWAPD_CPU_AFFINITY_MASK
+ string "kswapd cpu affinity mask"
+ depends on SMP
+ help
+ Set the cpu affinity for the kswapd task.
+ There can be power benefits on certain targets when limiting kswapd
+ to run only on certain cores.
+ The cpu affinity bitmask is represented by a hex string where commas
+ group hex digits into chunks. Each chunk defines exactly 32 bits of
+ the resultant bitmask.
+ For example to limit kswapd to the first 4 cores use the following:
+ CONFIG_KSWAPD_CPU_AFFINITY_MASK="f"
+
# For architectures that support deferred memory initialisation
config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
@@ -669,3 +717,37 @@ config ZONE_DEVICE
config FRAME_VECTOR
bool
+
+config FORCE_ALLOC_FROM_DMA_ZONE
+ bool "Force certain memory allocators to always return ZONE_DMA memory"
+ depends on ZONE_DMA
+ help
+ Ensure certain memory allocators always return memory from ZONE_DMA.
+ This option helps ensure that clients who require ZONE_DMA memory are
+ always using ZONE_DMA memory.
+
+ If unsure, say "n".
+
+config PROCESS_RECLAIM
+ bool "Enable process reclaim"
+ depends on PROC_FS
+ default n
+ help
+	  It allows reclaiming the pages of a process via /proc/PID/reclaim.
+
+ (echo file > /proc/PID/reclaim) reclaims file-backed pages only.
+ (echo anon > /proc/PID/reclaim) reclaims anonymous pages only.
+ (echo all > /proc/PID/reclaim) reclaims all pages.
+
+	  (echo addr size-bytes > /proc/PID/reclaim) reclaims pages in the
+	  range [addr, addr + size-bytes) of the process.
+
+	  Any other value is ignored.
+
+config VM_MAX_READAHEAD
+ int "default max readahead window size"
+ default 128
+ help
+	  This sets the VM_MAX_READAHEAD value (in kilobytes), allowing the
+	  readahead window to grow to the configured maximum size. This benefits
+	  sequential read throughput and thus early boot performance.
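
The CONFIG_KSWAPD_CPU_AFFINITY_MASK string above uses the kernel's standard comma-chunked hex cpumask format, so a consumer can hand it straight to cpumask_parse(). The vmscan.c side of this series is not shown in this excerpt; below is only a hedged sketch of how such a mask might be applied (the function name is illustrative):

	static void kswapd_set_affinity(struct task_struct *tsk)
	{
		cpumask_var_t mask;

		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
			return;
		/* cpumask_parse() understands the comma-grouped hex chunks */
		if (!cpumask_parse(CONFIG_KSWAPD_CPU_AFFINITY_MASK, mask))
			set_cpus_allowed_ptr(tsk, mask);
		free_cpumask_var(mask);
	}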
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..7470fd60fc59 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,76 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
+config SLUB_DEBUG_PANIC_ON
+ bool "Enable to Panic on SLUB corruption detection"
+ depends on SLUB_DEBUG
+ help
+	  SLUB has a resiliency feature which restores corrupted bytes so
+	  that production environments can continue to operate. With debug
+	  options enabled this may not be desirable, as it prevents
+	  investigating the root cause, which may lie within the cache or
+	  the memory itself.
+
config PAGE_POISONING
- bool
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_ENABLE_DEFAULT
+ bool "Enable page poisoning by default?"
+ default n
+ depends on PAGE_POISONING
+ ---help---
+	  Enable page poisoning of free pages by default? This value
+	  can be overridden by page_poison=off|on. This can be used
+	  to avoid passing the kernel parameter and leave the page
+	  poisoning feature enabled by default.
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of random data"
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+	  Enabling page poisoning with this option will disable hibernation.
+
+ If unsure, say N
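
For reference, the runtime switches named in the help texts above are kernel command line parameters; the *_ENABLE_DEFAULT options only pick the boot-time default. For example:

	debug_pagealloc=on page_poison=on

either of which overrides the corresponding Kconfig default at boot.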
diff --git a/mm/Makefile b/mm/Makefile
index ec91e951da28..04d48b46dbe9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -40,7 +40,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o $(mmu-y) showmem.o vmpressure.o
obj-y += init-mm.o
@@ -56,9 +56,10 @@ ifdef CONFIG_MMU
endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
-obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_ratio.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
+obj-$(CONFIG_ZCACHE) += zcache.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
@@ -67,7 +68,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
@@ -79,7 +80,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
-obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
+obj-$(CONFIG_MEMCG) += memcontrol.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
@@ -100,4 +101,5 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_PROCESS_RECLAIM) += process_reclaim.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 07e3b3b8e846..b12a49bf78de 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -237,6 +237,7 @@ static __init int bdi_class_init(void)
bdi_class->dev_groups = bdi_dev_groups;
bdi_debug_init();
+
return 0;
}
postcore_initcall(bdi_class_init);
@@ -781,6 +782,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->dev = NULL;
+ kref_init(&bdi->refcnt);
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
@@ -796,6 +798,22 @@ int bdi_init(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_init);
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+{
+ struct backing_dev_info *bdi;
+
+ bdi = kmalloc_node(sizeof(struct backing_dev_info),
+ gfp_mask | __GFP_ZERO, node_id);
+ if (!bdi)
+ return NULL;
+
+ if (bdi_init(bdi)) {
+ kfree(bdi);
+ return NULL;
+ }
+ return bdi;
+}
+
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
@@ -876,12 +894,26 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-void bdi_exit(struct backing_dev_info *bdi)
+static void bdi_exit(struct backing_dev_info *bdi)
{
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
}
+static void release_bdi(struct kref *ref)
+{
+ struct backing_dev_info *bdi =
+ container_of(ref, struct backing_dev_info, refcnt);
+
+ bdi_exit(bdi);
+ kfree(bdi);
+}
+
+void bdi_put(struct backing_dev_info *bdi)
+{
+ kref_put(&bdi->refcnt, release_bdi);
+}
+
void bdi_destroy(struct backing_dev_info *bdi)
{
bdi_unregister(bdi);
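
The backing-dev.c hunks above put bdi lifetime behind a kref: bdi_alloc_node() returns an initialised bdi holding one reference, and bdi_put() drops it, with release_bdi() running bdi_exit() plus kfree() on the final put. A hedged caller sketch (device naming and error handling are illustrative):

	struct backing_dev_info *bdi;

	bdi = bdi_alloc_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!bdi)
		return -ENOMEM;
	/* ... bdi_register(bdi, NULL, "mydev"), normal use ... */
	bdi_unregister(bdi);
	bdi_put(bdi);	/* final reference: release_bdi() frees it */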
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 300117f1a08f..6c563a4846c4 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
*/
if (trylock_page(page)) {
#ifdef CONFIG_BALLOON_COMPACTION
- if (!PagePrivate(page)) {
+ if (PageIsolated(page)) {
/* raced with isolation */
unlock_page(page);
continue;
@@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-static inline void __isolate_balloon_page(struct page *page)
+bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- ClearPagePrivate(page);
list_del(&page->lru);
b_dev_info->isolated_pages++;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+ return true;
}
-static inline void __putback_balloon_page(struct page *page)
+void balloon_page_putback(struct page *page)
{
struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- SetPagePrivate(page);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-/* __isolate_lru_page() counterpart for a ballooned page */
-bool balloon_page_isolate(struct page *page)
-{
- /*
- * Avoid burning cycles with pages that are yet under __free_pages(),
- * or just got freed under us.
- *
- * In case we 'win' a race for a balloon page being freed under us and
- * raise its refcount preventing __free_pages() from doing its job
- * the put_page() at the end of this block will take care of
- * release this page, thus avoiding a nasty leakage.
- */
- if (likely(get_page_unless_zero(page))) {
- /*
- * As balloon pages are not isolated from LRU lists, concurrent
- * compaction threads can race against page migration functions
- * as well as race against the balloon driver releasing a page.
- *
- * In order to avoid having an already isolated balloon page
- * being (wrongly) re-isolated while it is under migration,
- * or to avoid attempting to isolate pages being released by
- * the balloon driver, lets be sure we have the page lock
- * before proceeding with the balloon page isolation steps.
- */
- if (likely(trylock_page(page))) {
- /*
- * A ballooned page, by default, has PagePrivate set.
- * Prevent concurrent compaction threads from isolating
- * an already isolated balloon page by clearing it.
- */
- if (balloon_page_movable(page)) {
- __isolate_balloon_page(page);
- unlock_page(page);
- return true;
- }
- unlock_page(page);
- }
- put_page(page);
- }
- return false;
-}
-
-/* putback_lru_page() counterpart for a ballooned page */
-void balloon_page_putback(struct page *page)
-{
- /*
- * 'lock_page()' stabilizes the page and prevents races against
- * concurrent isolation threads attempting to re-isolate it.
- */
- lock_page(page);
-
- if (__is_movable_balloon_page(page)) {
- __putback_balloon_page(page);
- /* drop the extra ref count taken for page isolation */
- put_page(page);
- } else {
- WARN_ON(1);
- dump_page(page, "not movable balloon page");
- }
- unlock_page(page);
-}
/* move_to_new_page() counterpart for a ballooned page */
-int balloon_page_migrate(struct page *newpage,
- struct page *page, enum migrate_mode mode)
+int balloon_page_migrate(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct balloon_dev_info *balloon = balloon_page_device(page);
- int rc = -EAGAIN;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- if (WARN_ON(!__is_movable_balloon_page(page))) {
- dump_page(page, "not movable balloon page");
- return rc;
- }
+ return balloon->migratepage(balloon, newpage, page, mode);
+}
- if (balloon && balloon->migratepage)
- rc = balloon->migratepage(balloon, newpage, page, mode);
+const struct address_space_operations balloon_aops = {
+ .migratepage = balloon_page_migrate,
+ .isolate_page = balloon_page_isolate,
+ .putback_page = balloon_page_putback,
+};
+EXPORT_SYMBOL_GPL(balloon_aops);
- return rc;
-}
#endif /* CONFIG_BALLOON_COMPACTION */
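
With the change above, ballooned pages no longer signal mobility via PagePrivate; they are routed through balloon_aops using the generic movable-page hooks added later in this diff. A hedged sketch of the enqueue side a balloon driver would need, where balloon_mapping stands in for a mapping whose a_ops points at balloon_aops (the driver change itself is outside mm/):

	lock_page(page);
	__SetPageMovable(page, balloon_mapping);
	unlock_page(page);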
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 3b6380784c28..90336470273c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -154,7 +154,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
* down, but we are still initializing the system. Pages are given directly
* to the page allocator, no bootmem metadata is updated because it is gone.
*/
-void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
+void free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
diff --git a/mm/cma.c b/mm/cma.c
index 65c7aa419048..34e099574063 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,6 +35,7 @@
#include <linux/cma.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <linux/delay.h>
#include <trace/events/cma.h>
#include "cma.h"
@@ -133,6 +134,10 @@ static int __init cma_activate_area(struct cma *cma)
spin_lock_init(&cma->mem_head_lock);
#endif
+ if (!PageHighMem(pfn_to_page(cma->base_pfn)))
+ kmemleak_free_part(__va(cma->base_pfn << PAGE_SHIFT),
+ cma->count << PAGE_SHIFT);
+
return 0;
err:
@@ -384,6 +389,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
struct page *page = NULL;
int ret;
+ int retry_after_sleep = 0;
if (!cma || !cma->count)
return NULL;
@@ -394,19 +400,40 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
if (!count)
return NULL;
+ trace_cma_alloc_start(count, align);
+
mask = cma_bitmap_aligned_mask(cma, align);
offset = cma_bitmap_aligned_offset(cma, align);
bitmap_maxno = cma_bitmap_maxno(cma);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+ if (bitmap_count > bitmap_maxno)
+ return NULL;
+
for (;;) {
mutex_lock(&cma->lock);
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
bitmap_maxno, start, bitmap_count, mask,
offset);
if (bitmap_no >= bitmap_maxno) {
- mutex_unlock(&cma->lock);
- break;
+ if (retry_after_sleep < 2) {
+ start = 0;
+ /*
+ * Page may be momentarily pinned by some other
+ * process which has been scheduled out, eg.
+			 * process which has been scheduled out, e.g.
+ * fork and so cannot be freed there. Sleep
+ * for 100ms and retry twice to see if it has
+ * been freed later.
+ */
+ mutex_unlock(&cma->lock);
+ msleep(100);
+ retry_after_sleep++;
+ continue;
+ } else {
+ mutex_unlock(&cma->lock);
+ break;
+ }
}
bitmap_set(cma->bitmap, bitmap_no, bitmap_count);
/*
@@ -431,6 +458,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align)
pr_debug("%s(): memory range at %p is busy, retrying\n",
__func__, pfn_to_page(pfn));
+
+ trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align);
/* try again with a bit different memory target */
start = bitmap_no + mask + 1;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index b6f145ed7ae1..8cd8bfceae41 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -14,9 +15,11 @@
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
-#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/page_owner.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -57,13 +60,27 @@ static unsigned long release_freepages(struct list_head *freelist)
static void map_pages(struct list_head *list)
{
- struct page *page;
+ unsigned int i, order, nr_pages;
+ struct page *page, *next;
+ LIST_HEAD(tmp_list);
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+
+ order = page_private(page);
+ nr_pages = 1 << order;
+
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ if (order)
+ split_page(page, order);
- list_for_each_entry(page, list, lru) {
- arch_alloc_page(page, 0);
- kernel_map_pages(page, 1, 1);
- kasan_alloc_pages(page, 0);
+ for (i = 0; i < nr_pages; i++) {
+ list_add(&page->lru, &tmp_list);
+ page++;
+ }
}
+
+ list_splice(&tmp_list, list);
}
static inline bool migrate_async_suitable(int migratetype)
@@ -116,6 +133,44 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
#ifdef CONFIG_COMPACTION
+int PageMovable(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (!__PageMovable(page))
+ return 0;
+
+ mapping = page_mapping(page);
+ if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(PageMovable);
+
+void __SetPageMovable(struct page *page, struct address_space *mapping)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__SetPageMovable);
+
+void __ClearPageMovable(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ /*
+ * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
+ * flag so that VM can catch up released page by driver after isolation.
+ * With it, VM migration doesn't try to put it back.
+ */
+ page->mapping = (void *)((unsigned long)page->mapping &
+ PAGE_MAPPING_MOVABLE);
+}
+EXPORT_SYMBOL(__ClearPageMovable);
+
/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6
@@ -403,12 +458,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
+ unsigned int order;
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
- int isolated, i;
+ int isolated;
struct page *page = cursor;
/*
@@ -474,17 +530,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
goto isolate_fail;
}
- /* Found a free page, break it into order-0 pages */
- isolated = split_free_page(page);
+ /* Found a free page, will break it into order-0 pages */
+ order = page_order(page);
+ isolated = __isolate_free_page(page, order);
if (!isolated)
break;
+ set_page_private(page, order);
total_isolated += isolated;
cc->nr_freepages += isolated;
- for (i = 0; i < isolated; i++) {
- list_add(&page->lru, freelist);
- page++;
- }
+ list_add_tail(&page->lru, freelist);
+
if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
break;
@@ -603,7 +659,7 @@ isolate_freepages_range(struct compact_control *cc,
*/
}
- /* split_free_page does not map the pages */
+ /* __isolate_free_page() does not map the pages */
map_pages(&freelist);
if (pfn < end_pfn) {
@@ -632,21 +688,46 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
-/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(struct zone *zone)
+static bool __too_many_isolated(struct zone *zone, int safe)
{
unsigned long active, inactive, isolated;
- inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_ANON);
- active = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_ACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON);
+ active = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ACTIVE_ANON);
+ isolated = zone_page_state_snapshot(zone, NR_ISOLATED_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_INACTIVE_ANON);
+ active = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
+ }
return isolated > (inactive + active) / 2;
}
+/* Similar to reclaim, but different enough that they don't share logic */
+static bool too_many_isolated(struct compact_control *cc)
+{
+ /*
+ * __too_many_isolated(safe=0) is fast but inaccurate, because it
+ * doesn't account for the vm_stat_diff[] counters. So if it looks
+ * like too_many_isolated() is about to return true, fall back to the
+ * slower, more accurate zone_page_state_snapshot().
+ */
+ if (unlikely(__too_many_isolated(cc->zone, 0))) {
+ if (cc->mode != MIGRATE_ASYNC)
+ return __too_many_isolated(cc->zone, 1);
+ }
+
+ return false;
+}
+
/**
* isolate_migratepages_block() - isolate all migrate-able pages within
* a single pageblock
@@ -683,7 +764,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(zone))) {
+ while (unlikely(too_many_isolated(cc))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;
@@ -699,7 +780,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
- bool is_lru;
/*
* Periodically drop the lock (if held) regardless of its
@@ -740,21 +820,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
/*
- * Check may be lockless but that's ok as we recheck later.
- * It's possible to migrate LRU pages and balloon pages
- * Skip any other type of page
- */
- is_lru = PageLRU(page);
- if (!is_lru) {
- if (unlikely(balloon_page_movable(page))) {
- if (balloon_page_isolate(page)) {
- /* Successfully isolated */
- goto isolate_success;
- }
- }
- }
-
- /*
* Regardless of being on LRU, compound pages such as THP and
* hugetlbfs are not to be compacted. We can potentially save
* a lot of iterations if we skip them at once. The check is
@@ -770,8 +835,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
continue;
}
- if (!is_lru)
+ /*
+ * Check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU and non-lru movable pages.
+ * Skip any other type of page
+ */
+ if (!PageLRU(page)) {
+ /*
+ * __PageMovable can return false positive so we need
+ * to verify it under page_lock.
+ */
+ if (unlikely(__PageMovable(page)) &&
+ !PageIsolated(page)) {
+ if (locked) {
+ spin_unlock_irqrestore(&zone->lru_lock,
+ flags);
+ locked = false;
+ }
+
+ if (!isolate_movable_page(page, isolate_mode))
+ goto isolate_success;
+ }
+
continue;
+ }
/*
* Migration will fail if an anonymous page is pinned in memory,
@@ -1026,7 +1113,7 @@ static void isolate_freepages(struct compact_control *cc)
}
}
- /* split_free_page does not map the pages */
+ /* __isolate_free_page() does not map the pages */
map_pages(freelist);
/*
@@ -1218,11 +1305,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1258,7 +1345,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1)
+ true, cc->order, &can_steal) != -1)
return COMPACT_PARTIAL;
}
@@ -1365,10 +1452,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1504,6 +1590,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1762,4 +1849,225 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ if (kthread_should_stop())
+ return;
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
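
Taken together, PageMovable()/__SetPageMovable()/__ClearPageMovable() above define the contract for non-LRU movable pages: a driver tags page->mapping with PAGE_MAPPING_MOVABLE and supplies isolate_page, migratepage and putback_page in its address_space_operations, exactly as balloon_aops does earlier in this diff. A hedged outline of a driver implementing the three hooks (all demo_* names are illustrative):

	static bool demo_isolate(struct page *page, isolate_mode_t mode)
	{
		/* take the page off the driver's internal list; true on success */
		return true;
	}

	static int demo_migrate(struct address_space *mapping,
				struct page *newpage, struct page *page,
				enum migrate_mode mode)
	{
		/* copy contents/state to newpage and switch driver bookkeeping */
		return MIGRATEPAGE_SUCCESS;
	}

	static void demo_putback(struct page *page)
	{
		/* migration gave up: put the page back on the driver's list */
	}

	static const struct address_space_operations demo_aops = {
		.isolate_page	= demo_isolate,
		.migratepage	= demo_migrate,
		.putback_page	= demo_putback,
	};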
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
deleted file mode 100644
index 3b8f1b83610e..000000000000
--- a/mm/debug-pagealloc.c
+++ /dev/null
@@ -1,143 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/page_ext.h>
-#include <linux/poison.h>
-#include <linux/ratelimit.h>
-
-static bool page_poisoning_enabled __read_mostly;
-
-static bool need_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return false;
-
- return true;
-}
-
-static void init_page_poisoning(void)
-{
- if (!debug_pagealloc_enabled())
- return;
-
- page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
- .need = need_page_poisoning,
- .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (!page_ext)
- return;
- __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (!page_ext)
- return;
- __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline bool page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (!page_ext)
- return false;
- return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static void poison_page(struct page *page)
-{
- void *addr = kmap_atomic(page);
-
- set_page_poison(page);
- memset(addr, PAGE_POISON, PAGE_SIZE);
- kunmap_atomic(addr);
-}
-
-static void poison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- poison_page(page + i);
-}
-
-static bool single_bit_flip(unsigned char a, unsigned char b)
-{
- unsigned char error = a ^ b;
-
- return error && !(error & (error - 1));
-}
-
-static void check_poison_mem(unsigned char *mem, size_t bytes)
-{
- static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
- unsigned char *start;
- unsigned char *end;
-
- start = memchr_inv(mem, PAGE_POISON, bytes);
- if (!start)
- return;
-
- for (end = mem + bytes - 1; end > start; end--) {
- if (*end != PAGE_POISON)
- break;
- }
-
- if (!__ratelimit(&ratelimit))
- return;
- else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
- else
- printk(KERN_ERR "pagealloc: memory corruption\n");
-
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
- end - start + 1, 1);
- dump_stack();
-}
-
-static void unpoison_page(struct page *page)
-{
- void *addr;
-
- if (!page_poison(page))
- return;
-
- addr = kmap_atomic(page);
- check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
- kunmap_atomic(addr);
-}
-
-static void unpoison_pages(struct page *page, int n)
-{
- int i;
-
- for (i = 0; i < n; i++)
- unpoison_page(page + i);
-}
-
-void __kernel_map_pages(struct page *page, int numpages, int enable)
-{
- if (!page_poisoning_enabled)
- return;
-
- if (enable)
- unpoison_pages(page, numpages);
- else
- poison_pages(page, numpages);
-}
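
The corruption classifier from the deleted file survives in the new mm/page_poison.c (see the Makefile hunk above). Its core trick: XOR-ing the observed byte against PAGE_POISON isolates the changed bits, and error && !(error & (error - 1)) is the usual power-of-two test, true exactly when a single bit flipped. A standalone userspace model of the check (PAGE_POISON is 0xaa in the kernel):

	#include <stdio.h>
	#include <string.h>

	#define PAGE_POISON 0xaa
	#define PAGE_SZ     4096

	static int single_bit_flip(unsigned char a, unsigned char b)
	{
		unsigned char error = a ^ b;

		return error && !(error & (error - 1));
	}

	int main(void)
	{
		unsigned char page[PAGE_SZ];
		size_t i;

		memset(page, PAGE_POISON, PAGE_SZ);	/* poison on free */
		page[100] ^= 0x08;			/* simulate one flipped bit */

		for (i = 0; i < PAGE_SZ; i++)		/* verify on alloc */
			if (page[i] != PAGE_POISON) {
				printf("%s at offset %zu\n",
				       single_bit_flip(page[i], PAGE_POISON)
				       ? "single bit error" : "memory corruption", i);
				break;
			}
		return 0;
	}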
diff --git a/mm/debug.c b/mm/debug.c
index 689b6e911cae..89ac083ea504 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,6 +9,18 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
+};
static const struct trace_print_flags pageflag_names[] = {
{1UL << PG_locked, "locked" },
@@ -47,6 +59,9 @@ static const struct trace_print_flags pageflag_names[] = {
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
#endif
+#ifdef CONFIG_ZCACHE
+ {1UL << PG_was_active, "was_active" },
+#endif
};
static void dump_flags(unsigned long flags,
@@ -103,6 +118,7 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
dump_page_badflags(page, reason, 0);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
diff --git a/mm/filemap.c b/mm/filemap.c
index f217120973eb..6f3c539d9e68 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -239,10 +239,12 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* invalidate any existing cleancache entries. We can't leave
* stale data around in the cleancache once our page is gone
*/
- if (PageUptodate(page) && PageMappedToDisk(page))
+ if (PageUptodate(page) && PageMappedToDisk(page)) {
+ count_vm_event(PGPGOUTCLEAN);
cleancache_put_page(page);
- else
+ } else {
cleancache_invalidate_page(mapping, page);
+ }
page_cache_tree_delete(mapping, page, shadow);
@@ -395,19 +397,17 @@ static int __filemap_fdatawait_range(struct address_space *mapping,
goto out;
pagevec_init(&pvec, 0);
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ while (index <= end) {
unsigned i;
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
wait_on_page_writeback(page);
if (TestClearPageError(page))
ret = -EIO;
@@ -690,11 +690,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -817,6 +817,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -884,18 +885,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1425,9 +1428,10 @@ repeat:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_tag - find and return pages that match @tag
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
* @mapping: the address_space to search
* @index: the starting page index
+ * @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1435,8 +1439,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1450,6 +1455,9 @@ restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1488,18 +1496,28 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *index = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+	 * We come here when we reached @end. We take care not to overflow
+	 * @index, as that confuses some of the callers. This breaks the
+	 * iteration when there is a page at index -1, but that is already
+	 * broken anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
rcu_read_unlock();
- if (ret)
- *index = pages[ret - 1]->index + 1;
-
return ret;
}
-EXPORT_SYMBOL(find_get_pages_tag);
+EXPORT_SYMBOL(find_get_pages_range_tag);
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
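
The find_get_pages_range_tag() change moves range termination into the lookup itself: the radix-tree walk stops at @end and *index is advanced past the last returned page (saturating at (pgoff_t)-1), which is why __filemap_fdatawait_range() above can drop its per-page "page->index > end" filter. The canonical new-style loop, mirroring that hunk (PAGECACHE_TAG_DIRTY is used here purely as an example tag):

	pgoff_t index = start;
	struct pagevec pvec;
	unsigned nr_pages;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						    end, PAGECACHE_TAG_DIRTY);
		if (!nr_pages)
			break;
		/* every pvec.pages[i] is guaranteed to be <= end here */
		pagevec_release(&pvec);
	}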
diff --git a/mm/internal.h b/mm/internal.h
index d83afc995a49..184e2caa9613 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,6 +206,8 @@ extern void prep_compound_page(struct page *page, unsigned int order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
+extern void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags);
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -230,6 +232,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -247,9 +250,9 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal);
-
+int find_suitable_fallback(struct free_area *area, unsigned int current_order,
+ int migratetype, bool only_stealable,
+ int start_order, bool *can_steal);
#endif
/*
@@ -334,10 +337,8 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
-#endif
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index e1100433cefe..2976a9ee104f 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,5 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d4f13525e42e..ff0390823e04 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -223,8 +223,18 @@ static unsigned long jiffies_min_age;
static unsigned long jiffies_last_scan;
/* delay between automatic memory scannings */
static signed long jiffies_scan_wait;
-/* enables or disables the task stacks scanning */
+
+/*
+ * Enables or disables the task stacks scanning.
+ * Set to 1 to have it enabled at compile time, or to 0 to have it
+ * disabled by default. It can still be enabled at runtime by writing
+ * "stack=on" to the kmemleak debugfs entry.
+ */
+#ifdef CONFIG_DEBUG_TASK_STACK_SCAN_OFF
+static int kmemleak_stack_scan;
+#else
static int kmemleak_stack_scan = 1;
+#endif
+
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
/* setting kmemleak=on, will set this var, skipping the disable */
diff --git a/mm/ksm.c b/mm/ksm.c
index f51613052aee..3cd071cdfbb3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -37,6 +37,7 @@
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/show_mem_notifier.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -223,6 +224,9 @@ static unsigned int ksm_thread_pages_to_scan = 100;
/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;
+/* Boolean to indicate whether to use deferred timer or not */
+static bool use_deferred_timer;
+
#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
@@ -236,7 +240,7 @@ static int ksm_nr_node_ids = 1;
#define KSM_RUN_MERGE 1
#define KSM_RUN_UNMERGE 2
#define KSM_RUN_OFFLINE 4
-static unsigned long ksm_run = KSM_RUN_STOP;
+static unsigned long ksm_run = KSM_RUN_MERGE;
static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
@@ -247,6 +251,20 @@ static DEFINE_SPINLOCK(ksm_mmlist_lock);
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
+static int ksm_show_mem_notifier(struct notifier_block *nb,
+ unsigned long action,
+ void *data)
+{
+ pr_info("ksm_pages_sharing: %lu\n", ksm_pages_sharing);
+ pr_info("ksm_pages_shared: %lu\n", ksm_pages_shared);
+
+ return 0;
+}
+
+static struct notifier_block ksm_show_mem_notifier_block = {
+ .notifier_call = ksm_show_mem_notifier,
+};
+
static int __init ksm_slab_init(void)
{
rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
@@ -541,8 +559,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
void *expected_mapping;
unsigned long kpfn;
- expected_mapping = (void *)stable_node +
- (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+ expected_mapping = (void *)((unsigned long)stable_node |
+ PAGE_MAPPING_KSM);
again:
kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
@@ -1753,6 +1771,41 @@ static void ksm_do_scan(unsigned int scan_npages)
}
}
+static void process_timeout(unsigned long __data)
+{
+ wake_up_process((struct task_struct *)__data);
+}
+
+static signed long __sched deferred_schedule_timeout(signed long timeout)
+{
+ struct timer_list timer;
+ unsigned long expire;
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ if (timeout < 0) {
+ pr_err("schedule_timeout: wrong timeout value %lx\n",
+ timeout);
+ __set_current_state(TASK_RUNNING);
+ goto out;
+ }
+
+ expire = timeout + jiffies;
+
+ setup_deferrable_timer_on_stack(&timer, process_timeout,
+ (unsigned long)current);
+ mod_timer(&timer, expire);
+ schedule();
+ del_singleshot_timer_sync(&timer);
+
+ /* Remove the timer from the object tracker */
+ destroy_timer_on_stack(&timer);
+
+ timeout = expire - jiffies;
+
+out:
+ return timeout < 0 ? 0 : timeout;
+}
+
static int ksmd_should_run(void)
{
return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
@@ -1773,7 +1826,11 @@ static int ksm_scan_thread(void *nothing)
try_to_freeze();
if (ksmd_should_run()) {
- schedule_timeout_interruptible(
+ if (use_deferred_timer)
+ deferred_schedule_timeout(
+ msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ else
+ schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
wait_event_freezable(ksm_thread_wait,
@@ -1932,7 +1989,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
@@ -1956,6 +2013,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
stable_node = page_stable_node(page);
if (!stable_node)
return ret;
+
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -2225,6 +2283,26 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
}
KSM_ATTR(run);
+static ssize_t deferred_timer_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, 8, "%d\n", use_deferred_timer);
+}
+
+static ssize_t deferred_timer_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long enable;
+ int err;
+
+	err = kstrtoul(buf, 10, &enable);
+	if (err < 0)
+		return err;
+	use_deferred_timer = enable;
+
+ return count;
+}
+KSM_ATTR(deferred_timer);
+
#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -2337,6 +2415,7 @@ static struct attribute *ksm_attrs[] = {
&pages_unshared_attr.attr,
&pages_volatile_attr.attr,
&full_scans_attr.attr,
+ &deferred_timer_attr.attr,
#ifdef CONFIG_NUMA
&merge_across_nodes_attr.attr,
#endif
@@ -2381,6 +2460,8 @@ static int __init ksm_init(void)
/* There is no significance to this priority 100 */
hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
+
+ show_mem_notifier_register(&ksm_show_mem_notifier_block);
return 0;
out_free:
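
KSM_ATTR(deferred_timer) above exposes the new flag next to the other KSM knobs in sysfs, so the behaviour can be toggled at runtime:

	echo 1 > /sys/kernel/mm/ksm/deferred_timer

The design point is that a deferrable timer does not itself wake an idle CPU; ksmd's between-batch sleeps then piggyback on the next wakeup that happens anyway, trading scan latency for idle power.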
diff --git a/mm/maccess.c b/mm/maccess.c
index 18717e893a75..03ea550f5a74 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -170,8 +170,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
pagefault_disable();
do {
- ret = __copy_from_user_inatomic(dst++,
- (const void __user __force *)src++, 1);
+ ret = __get_user(*dst++, (const char __user __force *)src++);
} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
dst[-1] = '\0';
diff --git a/mm/memblock.c b/mm/memblock.c
index 99c7f493d45f..e39ef2fe5c17 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -19,6 +19,9 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
+#include <linux/preempt.h>
+#include <linux/seqlock.h>
+#include <linux/irqflags.h>
#include <asm-generic/sections.h>
#include <linux/io.h>
@@ -31,6 +34,7 @@ static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIO
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif
+static seqcount_t memblock_seq;
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
@@ -733,7 +737,8 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
(unsigned long long)base + size - 1,
(void *)_RET_IP_);
- kmemleak_free_part(__va(base), size);
+ if (base < memblock.current_limit)
+ kmemleak_free_part(__va(base), size);
return memblock_remove_range(&memblock.reserved, base, size);
}
@@ -834,6 +839,16 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
}
/**
+ * memblock_clear_nomap - Clear a flag of MEMBLOCK_NOMAP memory region
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ */
+int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
+}
+
+/**
* __next_reserved_mem_region - next function for for_each_reserved_region()
* @idx: pointer to u64 loop variable
* @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
@@ -1169,7 +1184,8 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
* The min_count is set to 0 so that memblock allocations are
* never reported as leaks.
*/
- kmemleak_alloc(__va(found), size, 0, 0);
+ if (found < memblock.current_limit)
+ kmemleak_alloc(__va(found), size, 0, 0);
return found;
}
return 0;
@@ -1509,7 +1525,7 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
(phys_addr_t)ULLONG_MAX);
}
-static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+static int __init_memblock __memblock_search(struct memblock_type *type, phys_addr_t addr)
{
unsigned int left = 0, right = type->cnt;
@@ -1527,6 +1543,19 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ int ret;
+ unsigned long seq;
+
+ do {
+ seq = raw_read_seqcount_begin(&memblock_seq);
+ ret = __memblock_search(type, addr);
+ } while (unlikely(read_seqcount_retry(&memblock_seq, seq)));
+
+ return ret;
+}
+
int __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
@@ -1585,6 +1614,14 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
memblock.memory.regions[idx].size) >= end;
}
+bool __init_memblock memblock_overlaps_memory(phys_addr_t base,
+ phys_addr_t size)
+{
+ memblock_cap_size(base, &size);
+
+ return memblock_overlaps_region(&memblock.memory, base, size);
+}
+
/**
* memblock_is_region_reserved - check if a region intersects reserved memory
* @base: base of region to check
@@ -1701,6 +1738,37 @@ void __init memblock_allow_resize(void)
memblock_can_resize = 1;
}
+static unsigned long __init_memblock
+memblock_resize_late(int begin, unsigned long flags)
+{
+ static int memblock_can_resize_old;
+
+ if (begin) {
+ preempt_disable();
+ local_irq_save(flags);
+ memblock_can_resize_old = memblock_can_resize;
+ memblock_can_resize = 0;
+ raw_write_seqcount_begin(&memblock_seq);
+ } else {
+ raw_write_seqcount_end(&memblock_seq);
+ memblock_can_resize = memblock_can_resize_old;
+ local_irq_restore(flags);
+ preempt_enable();
+ }
+
+ return flags;
+}
+
+unsigned long __init_memblock memblock_region_resize_late_begin(void)
+{
+ return memblock_resize_late(1, 0);
+}
+
+void __init_memblock memblock_region_resize_late_end(unsigned long flags)
+{
+ memblock_resize_late(0, flags);
+}
+
static int __init early_memblock(char *p)
{
if (p && strstr(p, "debug"))
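
The seqcount added above lets memblock_search() run locklessly even after boot: writers publish region changes inside a write-side critical section and readers retry if they raced (read_seqcount_retry). A hedged sketch of the writer side using the new helpers:

	unsigned long flags;

	flags = memblock_region_resize_late_begin();
	/* safely modify regions, e.g. memblock_remove(base, size) */
	memblock_region_resize_late_end(flags);

memblock_resize_late() also pins memblock_can_resize to 0 for the duration, so the region arrays cannot be reallocated under a concurrent reader.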
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 92a647957f91..d5d9839ede9b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1016,7 +1016,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(hpage, ttu);
+ ret = try_to_unmap(hpage, ttu, NULL);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -1173,7 +1173,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
diff --git a/mm/memory.c b/mm/memory.c
index fa752df6dc85..5dfc9fac8b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2665,7 +2665,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if ((PageSwapCache(page) && vm_swap_full(page_swap_info(page))) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
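The do_swap_page() hunk above switches from the global vm_swap_full() heuristic to a per-device check on the swap device backing the page, obtained via page_swap_info(). For illustration only, a plausible shape of such a predicate, assuming the inuse_pages/pages fields of struct swap_info_struct (the real definition lives in this tree's swap headers):

#include <linux/swap.h>

/* Hypothetical sketch: a device counts as "full" once more than
 * half of its slots are in use. */
static inline bool demo_vm_swap_full(struct swap_info_struct *si)
{
        return si->inuse_pages * 2 > si->pages;
}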
@@ -2880,7 +2881,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
}
static unsigned long fault_around_bytes __read_mostly =
- rounddown_pow_of_two(65536);
+ rounddown_pow_of_two(4096);
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 50bdb897ab00..aa03a24aee9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -32,6 +32,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <linux/rmap.h>
#include <asm/tlbflush.h>
@@ -45,7 +46,7 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page);
+static int generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -857,11 +858,12 @@ void __online_page_free(struct page *page)
}
EXPORT_SYMBOL_GPL(__online_page_free);
-static void generic_online_page(struct page *page)
+static int generic_online_page(struct page *page)
{
__online_page_set_limits(page);
__online_page_increment_counters(page);
__online_page_free(page);
+ return 0;
}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
@@ -870,11 +872,13 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
struct page *page;
+ int ret;
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
+ ret = (*online_page_callback)(page);
+ if (!ret)
+ onlined_pages++;
}
*(unsigned long *)arg = onlined_pages;
return 0;
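With the callback now returning int, online_pages_range() above counts only the pages the callback actually onlined. A sketch of how a driver-registered callback could use this (illustrative only; demo_should_defer() is a hypothetical policy, and registration goes through the existing set_online_page_callback()):

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

/* Hypothetical policy: pretend some pages are not ready to online. */
static bool demo_should_defer(struct page *page)
{
        return false;
}

static int demo_online_page(struct page *page)
{
        if (demo_should_defer(page))
                return -EBUSY;  /* skipped: not freed, not counted */

        __online_page_set_limits(page);
        __online_page_increment_counters(page);
        __online_page_free(page);
        return 0;
}

/* Registered with: set_online_page_callback(demo_online_page); */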
@@ -1013,7 +1017,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -1053,7 +1057,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1064,8 +1068,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1422,10 +1428,10 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
}
/*
- * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
- * and hugepages). We scan pfn because it's much easier than scanning over
- * linked list. This function returns the pfn of the first found movable
- * page if it's found, otherwise 0.
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
+ * non-lru movable pages and hugepages). We scan pfn because it's much
+ * easier than scanning over linked list. This function returns the pfn
+ * of the first found movable page if it's found, otherwise 0.
*/
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
@@ -1436,6 +1442,8 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
page = pfn_to_page(pfn);
if (PageLRU(page))
return pfn;
+ if (__PageMovable(page))
+ return pfn;
if (PageHuge(page)) {
if (page_huge_active(page))
return pfn;
@@ -1487,29 +1495,31 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (WARN_ON(PageLRU(page)))
isolate_lru_page(page);
if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS, NULL);
continue;
}
if (!get_page_unless_zero(page))
continue;
/*
- * We can skip free pages. And we can only deal with pages on
- * LRU.
+ * We can skip free pages. And we can handle both LRU pages
+ * and non-LRU movable pages.
*/
- ret = isolate_lru_page(page);
+ if (PageLRU(page))
+ ret = isolate_lru_page(page);
+ else
+ ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
put_page(page);
list_add_tail(&page->lru, &source);
move_pages--;
- inc_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
-
+ if (!__PageMovable(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
} else {
#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
- dump_page(page, "failed to remove from LRU");
+ pr_alert("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
@@ -1862,8 +1872,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
diff --git a/mm/memtest.c b/mm/memtest.c
index 8eaa4c3a5f65..15a423eb0c29 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -80,8 +80,8 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
}
/* default is disabled */
-static unsigned int memtest_pattern __initdata;
-
+static unsigned int memtest_pattern __initdata =
+ CONFIG_MEMTEST_ENABLE_DEFAULT;
static int __init parse_memtest(char *arg)
{
int ret = 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index 73da75d5e5b2..d8d8c2a3f530 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -31,6 +31,7 @@
#include <linux/vmalloc.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
+#include <linux/compaction.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
@@ -38,6 +39,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <linux/ptrace.h>
#include <asm/tlbflush.h>
@@ -73,6 +75,81 @@ int migrate_prep_local(void)
return 0;
}
+int isolate_movable_page(struct page *page, isolate_mode_t mode)
+{
+ struct address_space *mapping;
+
+ /*
+ * Avoid burning cycles with pages that are still under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a movable page being freed under us and
+ * raise its refcount, preventing __free_pages() from doing its job,
+ * the put_page() at the end of this block will take care of
+ * releasing this page, thus avoiding a nasty leak.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto out;
+
+ /*
+ * Check PageMovable before taking the page lock, because a page's
+ * owner assumes nobody touches the PG_locked bit of a newly
+ * allocated page, so unconditionally grabbing the lock would ruin
+ * the owner's side.
+ */
+ if (unlikely(!__PageMovable(page)))
+ goto out_putpage;
+ /*
+ * As movable pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the releasing a page.
+ *
+ * In order to avoid having an already isolated movable page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released,
+ * let's be sure we hold the page lock
+ * before proceeding with the movable page isolation steps.
+ */
+ if (unlikely(!trylock_page(page)))
+ goto out_putpage;
+
+ if (!PageMovable(page) || PageIsolated(page))
+ goto out_no_isolated;
+
+ mapping = page_mapping(page);
+ VM_BUG_ON_PAGE(!mapping, page);
+
+ if (!mapping->a_ops->isolate_page(page, mode))
+ goto out_no_isolated;
+
+ /* Driver shouldn't use PG_isolated bit of page->flags */
+ WARN_ON_ONCE(PageIsolated(page));
+ __SetPageIsolated(page);
+ unlock_page(page);
+
+ return 0;
+
+out_no_isolated:
+ unlock_page(page);
+out_putpage:
+ put_page(page);
+out:
+ return -EBUSY;
+}
+
+/* It should be called on page which is PG_movable */
+void putback_movable_page(struct page *page)
+{
+ struct address_space *mapping;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ mapping = page_mapping(page);
+ mapping->a_ops->putback_page(page);
+ __ClearPageIsolated(page);
+}
+
/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
@@ -94,10 +171,23 @@ void putback_movable_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (unlikely(isolated_balloon_page(page)))
- balloon_page_putback(page);
- else
+ /*
+ * We isolated a non-LRU movable page, so we can use
+ * __PageMovable here because an LRU page's mapping cannot
+ * have PAGE_MAPPING_MOVABLE set.
+ */
+ if (unlikely(__PageMovable(page))) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ } else {
putback_lru_page(page);
+ }
}
}
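A sketch of the driver side of the non-LRU movable page protocol that isolate_movable_page() and putback_movable_page() above call into (illustrative only, not part of the patch; every demo_* name is hypothetical). A driver marks its pages with __SetPageMovable() and supplies these address_space_operations:

#include <linux/fs.h>
#include <linux/migrate.h>
#include <linux/mm.h>

/* Hypothetical bookkeeping helpers for the driver's own lists. */
static bool demo_detach(struct page *page) { return true; }
static void demo_attach(struct page *page) { }
static void demo_copy_contents(struct page *dst, struct page *src) { }

/* Called with the page locked; return true if isolation succeeded. */
static bool demo_isolate_page(struct page *page, isolate_mode_t mode)
{
        return demo_detach(page);
}

/* Both pages are locked; move contents and report success. */
static int demo_migratepage(struct address_space *mapping,
                            struct page *newpage, struct page *page,
                            enum migrate_mode mode)
{
        demo_copy_contents(newpage, page);
        return MIGRATEPAGE_SUCCESS;
}

/* Migration failed or was abandoned; give the page back. */
static void demo_putback_page(struct page *page)
{
        demo_attach(page);
}

static const struct address_space_operations demo_aops = {
        .isolate_page = demo_isolate_page,
        .migratepage  = demo_migratepage,
        .putback_page = demo_putback_page,
};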
@@ -580,6 +670,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
}
EXPORT_SYMBOL(migrate_page_copy);
@@ -588,7 +680,7 @@ EXPORT_SYMBOL(migrate_page_copy);
***********************************************************/
/*
- * Common logic to directly migrate a single page suitable for
+ * Common logic to directly migrate a single LRU page suitable for
* pages that do not use PagePrivate/PagePrivate2.
*
* Pages are locked upon entry and exit.
@@ -751,24 +843,47 @@ static int move_to_new_page(struct page *newpage, struct page *page,
enum migrate_mode mode)
{
struct address_space *mapping;
- int rc;
+ int rc = -EAGAIN;
+ bool is_lru = !__PageMovable(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
mapping = page_mapping(page);
- if (!mapping)
- rc = migrate_page(mapping, newpage, page, mode);
- else if (mapping->a_ops->migratepage)
+
+ if (likely(is_lru)) {
+ if (!mapping)
+ rc = migrate_page(mapping, newpage, page, mode);
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * provide a migratepage callback. Anonymous pages
+ * are part of swap space which also has its own
+ * migratepage callback. This is the most common path
+ * for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ else
+ rc = fallback_migrate_page(mapping, newpage,
+ page, mode);
+ } else {
/*
- * Most pages have a mapping and most filesystems provide a
- * migratepage callback. Anonymous pages are part of swap
- * space which also has its own migratepage callback. This
- * is the most common path for page migration.
+ * A non-LRU page could have been released after the
+ * isolation step. In that case, we shouldn't try migration.
*/
- rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
- else
- rc = fallback_migrate_page(mapping, newpage, page, mode);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (!PageMovable(page)) {
+ rc = MIGRATEPAGE_SUCCESS;
+ __ClearPageIsolated(page);
+ goto out;
+ }
+
+ rc = mapping->a_ops->migratepage(mapping, newpage,
+ page, mode);
+ WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
+ !PageIsolated(page));
+ }
/*
* When successful, old pagecache page->mapping must be cleared before
@@ -776,9 +891,25 @@ static int move_to_new_page(struct page *newpage, struct page *page,
*/
if (rc == MIGRATEPAGE_SUCCESS) {
set_page_memcg(page, NULL);
- if (!PageAnon(page))
+ if (__PageMovable(page)) {
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ /*
+ * We clear PG_movable under page_lock so any compactor
+ * cannot try to migrate this page.
+ */
+ __ClearPageIsolated(page);
+ }
+
+ /*
+ * An anonymous or movable page's page->mapping will be cleared
+ * by free_pages_prepare(), so don't reset it here; keeping it
+ * lets checks such as PageAnon() keep working until the page
+ * is freed.
+ */
+ if (!PageMappingFlags(page))
page->mapping = NULL;
}
+out:
return rc;
}
@@ -788,6 +919,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
int rc = -EAGAIN;
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
+ bool is_lru = !__PageMovable(page);
if (!trylock_page(page)) {
if (!force || mode == MIGRATE_ASYNC)
@@ -856,15 +988,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (unlikely(!trylock_page(newpage)))
goto out_unlock;
- if (unlikely(isolated_balloon_page(page))) {
- /*
- * A ballooned page does not need any special attention from
- * physical to virtual reverse mapping procedures.
- * Skip any attempt to unmap PTEs or to remap swap cache,
- * in order to avoid burning cycles at rmap level, and perform
- * the page migration right away (proteced by page lock).
- */
- rc = balloon_page_migrate(newpage, page, mode);
+ if (unlikely(!is_lru)) {
+ rc = move_to_new_page(newpage, page, mode);
goto out_unlock_both;
}
@@ -891,7 +1016,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS, NULL);
page_was_mapped = 1;
}
@@ -910,6 +1035,19 @@ out_unlock:
put_anon_vma(anon_vma);
unlock_page(page);
out:
+ /*
+ * If migration is successful, drop our reference on the newpage;
+ * this will not free the page because the new owner holds a
+ * reference. If the newpage is an LRU page, also add it to the
+ * LRU list here.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ if (unlikely(__PageMovable(newpage)))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+
return rc;
}
@@ -936,7 +1074,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
int rc = MIGRATEPAGE_SUCCESS;
int *result = NULL;
struct page *newpage;
- bool is_lru = !isolated_balloon_page(page);
newpage = get_new_page(page, private, &result);
if (!newpage)
@@ -944,6 +1081,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ if (unlikely(__PageMovable(page))) {
+ lock_page(page);
+ if (!PageMovable(page))
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ }
+ if (put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
goto out;
}
@@ -952,8 +1101,9 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
- put_new_page = NULL;
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -966,35 +1116,45 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- /* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) {
+ }
+
+ /*
+ * If migration is successful, release the reference grabbed during
+ * isolation. Otherwise, restore the page to the right list unless
+ * we want to retry.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ put_page(page);
+ if (reason == MR_MEMORY_FAILURE) {
/*
- * With this release, we free successfully migrated
- * page and set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird, it's how
- * HWPoison flag works at the moment.
+ * Set PG_HWPoison on just freed page
+ * intentionally. Although it's rather weird,
+ * it's how HWPoison flag works at the moment.
*/
- put_page(page);
if (!test_set_page_hwpoison(page))
num_poisoned_pages_inc();
- } else
- putback_lru_page(page);
- }
+ }
+ } else {
+ if (rc != -EAGAIN) {
+ if (likely(!__PageMovable(page))) {
+ putback_lru_page(page);
+ goto put_new;
+ }
- /*
- * If migration was not successful and there's a freeing callback, use
- * it. Otherwise, putback_lru_page() will drop the reference grabbed
- * during isolation. Use the old state of the isolated source page to
- * determine if we migrated a LRU page. newpage was already unlocked
- * and possibly modified by its owner - don't rely on the page state.
- */
- if (put_new_page)
- put_new_page(newpage, private);
- else if (rc == MIGRATEPAGE_SUCCESS && unlikely(!is_lru)) {
- /* drop our reference, page already in the balloon */
- put_page(newpage);
- } else
- putback_lru_page(newpage);
+ lock_page(page);
+ if (PageMovable(page))
+ putback_movable_page(page);
+ else
+ __ClearPageIsolated(page);
+ unlock_page(page);
+ put_page(page);
+ }
+put_new:
+ if (put_new_page)
+ put_new_page(newpage, private);
+ else
+ put_page(newpage);
+ }
if (result) {
if (rc)
@@ -1026,7 +1186,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1074,7 +1234,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_mapped(hpage)) {
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS, NULL);
page_was_mapped = 1;
}
@@ -1094,6 +1254,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
out_unlock:
@@ -1155,6 +1316,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
+ trace_mm_migrate_pages_start(mode, reason);
+
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -1167,7 +1330,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1784,7 +1947,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1851,6 +2014,7 @@ fail_putback:
set_page_memcg(new_page, page_memcg(page));
set_page_memcg(page, NULL);
page_remove_rmap(page);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mmap.c b/mm/mmap.c
index 1b0c4392f65c..f13fddbfdd29 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -49,6 +49,10 @@
#include <asm/tlb.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_MSM_APP_SETTINGS
+#include <asm/app_api.h>
+#endif
+
#include "internal.h"
#ifndef arch_mmap_check
@@ -203,6 +207,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
+ * Part of the kernel memory, which can be released
+ * under memory pressure.
+ */
+ free += global_page_state(
+ NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+
+ /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
@@ -1341,6 +1352,11 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (!len)
return -EINVAL;
+#ifdef CONFIG_MSM_APP_SETTINGS
+ if (use_app_setting)
+ apply_app_setting_bit(file);
+#endif
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index e57cf24babd6..a81d521db56a 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -76,7 +76,7 @@ again:
* down, but we are still initializing the system. Pages are given directly
* to the page allocator, no bootmem metadata is updated because it is gone.
*/
-void __init free_bootmem_late(unsigned long addr, unsigned long size)
+void free_bootmem_late(unsigned long addr, unsigned long size)
{
unsigned long cursor, end;
diff --git a/mm/nommu.c b/mm/nommu.c
index b2adb43e4cb4..e5b6dc17f92b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1876,6 +1876,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
+ * Part of the kernel memory, which can be released
+ * under memory pressure.
+ */
+ free += global_page_state(
+ NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
+
+ /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
if (free <= totalreserve_pages)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7e5d3467d6d2..1ba63d3477cb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -350,7 +350,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
* swapents, oom_score_adj value, and name.
*/
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 698806914be7..fe48dba19d36 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
unsigned long nr_pages;
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ nr_pages -= min(nr_pages, zone->totalreserve_pages);
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
unsigned long x;
x = global_page_state(NR_FREE_PAGES);
- x -= min(x, dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ x -= min(x, totalreserve_pages);
x += global_page_state(NR_INACTIVE_FILE);
x += global_page_state(NR_ACTIVE_FILE);
@@ -1944,6 +1954,12 @@ void throttle_vm_writeout(gfp_t gfp_mask)
if (global_page_state(NR_UNSTABLE_NFS) +
global_page_state(NR_WRITEBACK) <= dirty_thresh)
break;
+ /* Try safe version */
+ else if (unlikely(global_page_state_snapshot(NR_UNSTABLE_NFS) +
+ global_page_state_snapshot(NR_WRITEBACK) <=
+ dirty_thresh))
+ break;
+
congestion_wait(BLK_RW_ASYNC, HZ/10);
/*
@@ -1978,11 +1994,11 @@ void laptop_mode_timer_fn(unsigned long data)
* We want to write everything out, not just down to the dirty
* threshold
*/
- if (!bdi_has_dirty_io(&q->backing_dev_info))
+ if (!bdi_has_dirty_io(q->backing_dev_info))
return;
rcu_read_lock();
- list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
+ list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
if (wb_has_dirty_io(wb))
wb_start_writeback(wb, nr_pages, true,
WB_REASON_LAPTOP_TIMER);
@@ -2189,30 +2205,14 @@ int write_cache_pages(struct address_space *mapping,
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- */
- if (page->index > end) {
- /*
- * can't be range_cyclic (1st pass) because
- * end == -1 in that case.
- */
- done = 1;
- break;
- }
-
done_index = page->index;
lock_page(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2db537e1da49..1dcb8ff0e681 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory. This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -230,6 +223,20 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
static void free_compound_page(struct page *page);
+
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+ "HighAtomic",
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
@@ -475,6 +482,7 @@ static void bad_page(struct page *page, const char *reason,
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
dump_page_badflags(page, reason, bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -521,7 +529,8 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -532,6 +541,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -1032,9 +1044,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
- kasan_free_pages(page, order);
- if (PageAnon(page))
+ if (PageMappingFlags(page))
page->mapping = NULL;
bad += free_pages_check(page);
for (i = 1; i < (1 << order); i++) {
@@ -1054,7 +1065,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
+ kasan_free_pages(page, order);
return true;
}
@@ -1075,8 +1088,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page,
- unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned long pfn, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1148,7 +1160,7 @@ static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
#endif
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
@@ -1326,6 +1338,11 @@ void __init page_alloc_init_late(void)
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
+bool is_cma_pageblock(struct page *page)
+{
+ return get_pageblock_migratetype(page) == MIGRATE_CMA;
+}
+
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
@@ -1433,8 +1450,27 @@ static inline int check_new_page(struct page *page)
return 0;
}
+static inline bool free_pages_prezeroed(void)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled();
+}
+
+inline void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
+{
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ kasan_alloc_pages(page, order);
+ arch_alloc_page(page, order);
+ kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
+ set_page_owner(page, order, gfp_flags);
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
- int alloc_flags)
+ int alloc_flags)
{
int i;
@@ -1444,22 +1480,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
return 1;
}
- set_page_private(page, 0);
- set_page_refcounted(page);
+ post_alloc_hook(page, order, gfp_flags);
- arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
- kasan_alloc_pages(page, order);
-
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
- set_page_owner(page, order, gfp_flags);
-
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
@@ -1522,6 +1551,11 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+int *get_migratetype_fallbacks(int mtype)
+{
+ return fallbacks[mtype];
+}
+
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
@@ -1625,7 +1659,8 @@ static void change_pageblock_range(struct page *pageblock_page,
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
*/
-static bool can_steal_fallback(unsigned int order, int start_mt)
+static bool can_steal_fallback(unsigned int current_order, unsigned int start_order,
+ int start_mt, int fallback_mt)
{
/*
* Leaving this order check is intended, although there is
@@ -1634,12 +1669,17 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
* but, below check doesn't guarantee it and that is just heuristic
* so could be changed anytime.
*/
- if (order >= pageblock_order)
+ if (current_order >= pageblock_order)
return true;
- if (order >= pageblock_order / 2 ||
+ /* don't let unmovable allocations cause migrations simply because of free pages */
+ if ((start_mt != MIGRATE_UNMOVABLE && current_order >= pageblock_order / 2) ||
+ /* only steal reclaimable page blocks for unmovable allocations */
+ (start_mt == MIGRATE_UNMOVABLE && fallback_mt != MIGRATE_MOVABLE && current_order >= pageblock_order / 2) ||
+ /* reclaimable can steal aggressively */
start_mt == MIGRATE_RECLAIMABLE ||
- start_mt == MIGRATE_UNMOVABLE ||
+ /* allow unmovable allocs up to 64K without migrating blocks */
+ (start_mt == MIGRATE_UNMOVABLE && start_order >= 5) ||
page_group_by_mobility_disabled)
return true;
@@ -1679,8 +1719,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
* we can steal other freepages all together. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
-int find_suitable_fallback(struct free_area *area, unsigned int order,
- int migratetype, bool only_stealable, bool *can_steal)
+int find_suitable_fallback(struct free_area *area, unsigned int current_order,
+ int migratetype, bool only_stealable,
+ int start_order, bool *can_steal)
{
int i;
int fallback_mt;
@@ -1697,7 +1738,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
if (list_empty(&area->free_list[fallback_mt]))
continue;
- if (can_steal_fallback(order, migratetype))
+ if (can_steal_fallback(current_order, start_order, migratetype, fallback_mt))
*can_steal = true;
if (!only_stealable)
@@ -1833,13 +1874,14 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
- start_migratetype, false, &can_steal);
+ start_migratetype, false, order, &can_steal);
if (fallback_mt == -1)
continue;
page = list_entry(area->free_list[fallback_mt].next,
struct page, lru);
- if (can_steal)
+ if (can_steal &&
+ get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@@ -1878,17 +1920,30 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
- page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype);
+ page = __rmqueue_fallback(zone, order, migratetype);
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
+{
+ struct page *page = NULL;
+ if (IS_ENABLED(CONFIG_CMA))
+ if (!zone->cma_alloc)
+ page = __rmqueue_cma_fallback(zone, order);
+ trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
+ return page;
+}
+#else
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
+{
+ return NULL;
+}
+#endif
+
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
@@ -1902,7 +1957,17 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype, 0);
+ struct page *page;
+
+ /*
+ * If migratetype CMA is being requested, only try to
+ * satisfy the request with CMA pages, so as to increase
+ * CMA utilization.
+ */
+ if (is_migrate_cma(migratetype))
+ page = __rmqueue_cma(zone, order);
+ else
+ page = __rmqueue(zone, order, migratetype, 0);
if (unlikely(page == NULL))
break;
@@ -1929,6 +1994,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return i;
}
+/*
+ * Return the pcp list that corresponds to the migrate type if that list isn't
+ * empty.
+ * If the list is empty return NULL.
+ */
+static struct list_head *get_populated_pcp_list(struct zone *zone,
+ unsigned int order, struct per_cpu_pages *pcp,
+ int migratetype, int cold)
+{
+ struct list_head *list = &pcp->lists[migratetype];
+
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, order,
+ pcp->batch, list,
+ migratetype, cold);
+
+ if (list_empty(list))
+ list = NULL;
+ }
+ return list;
+}
+
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -2176,7 +2263,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
- gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2190,12 +2276,9 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- gfp_mask = get_page_owner_gfp(page);
- set_page_owner(page, 0, gfp_mask);
- for (i = 1; i < (1 << order); i++) {
+ for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, gfp_mask);
- }
+ split_page_owner(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2213,7 +2296,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
if (!is_migrate_isolate(mt)) {
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ if (!is_migrate_cma(mt) &&
+ !zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
__mod_zone_freepage_state(zone, -(1UL << order), mt);
@@ -2224,14 +2308,13 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- set_page_owner(page, order, __GFP_MOVABLE);
-
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
+ if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
+ && mt != MIGRATE_HIGHATOMIC)
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -2242,33 +2325,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
- unsigned int order;
- int nr_pages;
-
- order = page_order(page);
-
- nr_pages = __isolate_free_page(page, order);
- if (!nr_pages)
- return 0;
-
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
- return nr_pages;
-}
-
-/*
* Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
@@ -2277,21 +2333,32 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
gfp_t gfp_flags, int alloc_flags, int migratetype)
{
unsigned long flags;
- struct page *page;
+ struct page *page = NULL;
bool cold = ((gfp_flags & __GFP_COLD) != 0);
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
- struct list_head *list;
+ struct list_head *list = NULL;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- if (unlikely(list_empty(list)))
+
+ /* First try to get CMA pages */
+ if (migratetype == MIGRATE_MOVABLE &&
+ gfp_flags & __GFP_CMA) {
+ list = get_populated_pcp_list(zone, 0, pcp,
+ get_cma_migrate_type(), cold);
+ }
+
+ if (list == NULL) {
+ /*
+ * Either CMA is not suitable or there are no free CMA
+ * pages.
+ */
+ list = get_populated_pcp_list(zone, 0, pcp,
+ migratetype, cold);
+ if (unlikely(list == NULL) ||
+ unlikely(list_empty(list)))
goto failed;
}
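Summarized, the order-0 fast path above now probes the pcp lists in two steps. A condensed sketch of that flow (illustrative only; locking and statistics are elided, and the __GFP_CMA flag and get_cma_migrate_type() helper are the ones this tree introduces elsewhere):

static struct page *demo_pcp_alloc(struct zone *zone,
                                   struct per_cpu_pages *pcp,
                                   gfp_t gfp_flags, int migratetype,
                                   bool cold)
{
        struct list_head *list = NULL;
        struct page *page;

        /* Step 1: prefer CMA pages for movable requests that allow it. */
        if (migratetype == MIGRATE_MOVABLE && (gfp_flags & __GFP_CMA))
                list = get_populated_pcp_list(zone, 0, pcp,
                                get_cma_migrate_type(), cold);

        /* Step 2: fall back to the originally requested migratetype. */
        if (!list)
                list = get_populated_pcp_list(zone, 0, pcp,
                                migratetype, cold);
        if (!list)
                return NULL;    /* buddy refill failed: take slow path */

        /* Cold allocations come from the tail, hot ones from the head. */
        page = cold ? list_entry(list->prev, struct page, lru)
                    : list_entry(list->next, struct page, lru);
        list_del(&page->lru);
        pcp->count--;
        return page;
}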
@@ -2324,8 +2391,13 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
+ if (!page && migratetype == MIGRATE_MOVABLE &&
+ gfp_flags & __GFP_CMA)
+ page = __rmqueue_cma(zone, order);
+
if (!page)
page = __rmqueue(zone, order, migratetype, gfp_flags);
+
spin_unlock(&zone->lock);
if (!page)
goto failed;
@@ -2485,6 +2557,14 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+#ifdef CONFIG_CMA
+ /*
+ * Note that this check is needed only
+ * when MIGRATE_CMA < MIGRATE_PCPTYPES.
+ */
+ if (mt == MIGRATE_CMA)
+ continue;
+#endif
if (!list_empty(&area->free_list[mt]))
return true;
}
@@ -3703,6 +3783,13 @@ long si_mem_available(void)
available += global_page_state(NR_SLAB_RECLAIMABLE) -
min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ /*
+ * Part of the kernel memory, which can be released under memory
+ * pressure.
+ */
+ available += global_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
+ PAGE_SHIFT;
+
if (available < 0)
available = 0;
return available;
@@ -5311,6 +5398,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6081,20 +6171,12 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
+
+ zone->totalreserve_pages = max;
+
reserve_pages += max;
- /*
- * Lowmem reserves are not available to
- * GFP_HIGHUSER page cache allocations and
- * kswapd tries to balance zones to their high
- * watermark. As a result, neither should be
- * regarded as dirtyable memory, to prevent a
- * situation where reclaim has to clean pages
- * in order to balance the zones.
- */
- zone->dirty_balance_reserve = max;
}
}
- dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
@@ -6629,8 +6711,9 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages)
@@ -6686,6 +6769,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if (skip_hwpoisoned_pages && PageHWPoison(page))
continue;
+ if (__PageMovable(page))
+ continue;
+
if (!PageLRU(page))
found++;
/*
@@ -6855,6 +6941,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ cc.zone->cma_alloc = 1;
+
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret)
goto done;
@@ -6913,6 +7001,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
done:
undo_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype);
+ cc.zone->cma_alloc = 0;
return ret;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index de1f34c5a2f1..95e9e9090c7a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -54,9 +54,6 @@
static struct page_ext_operations *page_ext_ops[] = {
&debug_guardpage_ops,
-#ifdef CONFIG_PAGE_POISONING
- &page_poisoning_ops,
-#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
@@ -111,6 +108,9 @@ struct page_ext *lookup_page_ext(struct page *page)
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
@@ -183,8 +183,11 @@ struct page_ext *lookup_page_ext(struct page *page)
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
- if (!section->page_ext)
+ if (!section || !section->page_ext)
return NULL;
return section->page_ext + pfn;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 00c96462cc36..efb6c3c38c01 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -7,6 +7,8 @@
#include <linux/pageblock-flags.h>
#include <linux/memory.h>
#include <linux/hugetlb.h>
+#include <linux/kasan.h>
+#include <linux/page_owner.h>
#include "internal.h"
static int set_migratetype_isolate(struct page *page,
@@ -105,8 +107,6 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (pfn_valid_within(page_to_pfn(buddy)) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
- kernel_map_pages(page, (1 << order), 1);
- set_page_refcounted(page);
isolated_page = page;
}
}
@@ -125,8 +125,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
zone->nr_isolate_pageblock--;
out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (isolated_page)
+ if (isolated_page) {
+ post_alloc_hook(page, order, __GFP_MOVABLE);
__free_pages(isolated_page, order);
+ }
}
static inline struct page *
diff --git a/mm/page_owner.c b/mm/page_owner.c
index dd6b9cebf981..10b7f196b005 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,24 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
+#include <linux/stackdepot.h>
+
#include "internal.h"
-static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+/*
+ * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
+ * to use off-stack temporary storage
+ */
+#define PAGE_OWNER_STACK_DEPTH (16)
+
+static bool page_owner_disabled =
+ !IS_ENABLED(CONFIG_PAGE_OWNER_ENABLE_DEFAULT);
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
+
+static depot_stack_handle_t dummy_handle;
+static depot_stack_handle_t failure_handle;
static void init_early_allocated_pages(void);
@@ -20,6 +34,9 @@ static int early_page_owner_param(char *buf)
if (strcmp(buf, "on") == 0)
page_owner_disabled = false;
+ if (strcmp(buf, "off") == 0)
+ page_owner_disabled = true;
+
return 0;
}
early_param("page_owner", early_page_owner_param);
@@ -32,12 +49,42 @@ static bool need_page_owner(void)
return true;
}
+static noinline void register_dummy_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace dummy;
+
+ dummy.nr_entries = 0;
+ dummy.max_entries = ARRAY_SIZE(entries);
+ dummy.entries = &entries[0];
+ dummy.skip = 0;
+
+ save_stack_trace(&dummy);
+ dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+}
+
+static noinline void register_failure_stack(void)
+{
+ unsigned long entries[4];
+ struct stack_trace failure;
+
+ failure.nr_entries = 0;
+ failure.max_entries = ARRAY_SIZE(entries);
+ failure.entries = &entries[0];
+ failure.skip = 0;
+
+ save_stack_trace(&failure);
+ failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+}
+
static void init_page_owner(void)
{
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ register_dummy_stack();
+ register_failure_stack();
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -59,52 +106,135 @@ void __reset_page_owner(struct page *page, unsigned int order)
}
}
-void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+static inline bool check_recursive_alloc(struct stack_trace *trace,
+ unsigned long ip)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ int i, count;
+
+ if (!trace->nr_entries)
+ return false;
+
+ for (i = 0, count = 0; i < trace->nr_entries; i++) {
+ if (trace->entries[i] == ip && ++count == 2)
+ return true;
+ }
+
+ return false;
+}
+static noinline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
.nr_entries = 0,
- .max_entries = ARRAY_SIZE(page_ext->trace_entries),
- .entries = &page_ext->trace_entries[0],
- .skip = 3,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 2
};
+ depot_stack_handle_t handle;
+
+ save_stack_trace(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ /*
+ * We need to check for recursion here because our request to
+ * stackdepot could trigger a memory allocation to save the new
+ * entry. That allocation would reach this point and call
+ * depot_save_stack() again if we didn't catch it, and since
+ * stackdepot would still be short on memory, it would loop forever.
+ */
+ if (check_recursive_alloc(&trace, _RET_IP_))
+ return dummy_handle;
+
+ handle = depot_save_stack(&trace, flags);
+ if (!handle)
+ handle = failure_handle;
+
+ return handle;
+}
+
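/*
 * Illustrative aside (not part of the patch): the save/fetch round
 * trip through the stack depot that save_stack() above relies on.
 * A trace is deduplicated into a compact handle on save, and the
 * handle is expanded back into entries on fetch. demo_* names are
 * hypothetical.
 */
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

static void demo_depot_round_trip(void)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .nr_entries  = 0,
                .entries     = entries,
                .max_entries = ARRAY_SIZE(entries),
                .skip        = 0,
        };
        depot_stack_handle_t handle;

        save_stack_trace(&trace);
        handle = depot_save_stack(&trace, GFP_KERNEL);

        if (handle) {
                depot_fetch_stack(handle, &trace);      /* refills trace */
                print_stack_trace(&trace, 0);
        }
}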
+noinline void __set_page_owner(struct page *page, unsigned int order,
+ gfp_t gfp_mask)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
return;
- save_stack_trace(&trace);
-
+ page_ext->handle = save_stack(gfp_mask);
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
- page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
-gfp_t __get_page_owner_gfp(struct page *page)
+void __set_page_owner_migrate_reason(struct page *page, int reason)
{
struct page_ext *page_ext = lookup_page_ext(page);
if (unlikely(!page_ext))
+ return;
+
+ page_ext->last_migrate_reason = reason;
+}
+
+void __split_page_owner(struct page *page, unsigned int order)
+{
+ int i;
+ struct page_ext *page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
/*
- * The caller just returns 0 if no valid gfp
- * So return 0 here too.
+ * The caller just returns if there is no valid page_ext,
+ * so return here too.
*/
- return 0;
+ return;
+
+ page_ext->order = 0;
+ for (i = 1; i < (1 << order); i++)
+ __copy_page_owner(page, page + i);
+}
+
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+
+ if (unlikely(!old_ext || !new_ext))
+ return;
- return page_ext->gfp_mask;
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->last_migrate_reason = old_ext->last_migrate_reason;
+ new_ext->handle = old_ext->handle;
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overall stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
}
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
- struct page *page, struct page_ext *page_ext)
+ struct page *page, struct page_ext *page_ext,
+ depot_stack_handle_t handle)
{
int ret;
int pageblock_mt, page_mt;
char *kbuf;
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
struct stack_trace trace = {
- .nr_entries = page_ext->nr_entries,
- .entries = &page_ext->trace_entries[0],
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
};
kbuf = kmalloc(count, GFP_KERNEL);
@@ -112,8 +242,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
@@ -122,31 +253,29 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
+ depot_fetch_stack(handle, &trace);
ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -162,14 +291,58 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ unsigned long entries[PAGE_OWNER_STACK_DEPTH];
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = PAGE_OWNER_STACK_DEPTH,
+ .skip = 0
+ };
+ depot_stack_handle_t handle;
+ gfp_t gfp_mask;
+ int mt;
+
+ if (unlikely(!page_ext)) {
+ pr_alert("There is not page extension available.\n");
+ return;
+ }
+ gfp_mask = page_ext->gfp_mask;
+ mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ depot_fetch_stack(handle, &trace);
+ pr_alert("page allocated via order %u, migratetype %s, "
+ "gfp_mask %#x(%pGg)\n", page_ext->order,
+ migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
unsigned long pfn;
struct page *page;
struct page_ext *page_ext;
+ depot_stack_handle_t handle;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -216,10 +389,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Access to page_ext->handle isn't synchronized, so we must
+ * be careful when reading it.
+ */
+ handle = READ_ONCE(page_ext->handle);
+ if (!handle)
+ continue;
+
/* Record the next PFN to read in the file offset */
*ppos = (pfn - min_low_pfn) + 1;
- return print_page_owner(buf, count, pfn, page, page_ext);
+ return print_page_owner(buf, count, pfn, page,
+ page_ext, handle);
}
return 0;
@@ -258,6 +440,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
page = pfn_to_page(pfn);
+ if (page_zone(page) != zone)
+ continue;
+
/*
* We are safe to check buddy flag and order, because
* this is init stage and only single thread runs.
@@ -321,7 +506,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/page_poison.c b/mm/page_poison.c
new file mode 100644
index 000000000000..c8cf230dbfcb
--- /dev/null
+++ b/mm/page_poison.c
@@ -0,0 +1,134 @@
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/page_ext.h>
+#include <linux/poison.h>
+#include <linux/ratelimit.h>
+
+static bool want_page_poisoning __read_mostly
+ = IS_ENABLED(CONFIG_PAGE_POISONING_ENABLE_DEFAULT);
+
+static int early_page_poison_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
+
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ /*
+ * Assumes that debug_pagealloc_enabled is set before
+ * free_all_bootmem.
+ * Page poisoning stands in for debug page_alloc on arches
+ * without ARCH_SUPPORTS_DEBUG_PAGEALLOC. If either of those
+ * options is enabled, enable poisoning.
+ */
+ return (want_page_poisoning ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled()));
+}
+
+static void poison_page(struct page *page)
+{
+ void *addr = kmap_atomic(page);
+
+ memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+static void poison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ poison_page(page + i);
+}
+
+static bool single_bit_flip(unsigned char a, unsigned char b)
+{
+ unsigned char error = a ^ b;
+
+ return error && !(error & (error - 1));
+}
+
+static void check_poison_mem(struct page *page,
+ unsigned char *mem, size_t bytes)
+{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
+ unsigned char *start;
+ unsigned char *end;
+
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
+ return;
+
+ for (end = mem + bytes - 1; end > start; end--) {
+ if (*end != PAGE_POISON)
+ break;
+ }
+
+ if (!__ratelimit(&ratelimit))
+ return;
+ else if (start == end && single_bit_flip(*start, PAGE_POISON))
+ pr_err("pagealloc: single bit error on page with phys start 0x%lx\n",
+ (unsigned long)page_to_phys(page));
+ else
+ pr_err("pagealloc: memory corruption on page with phys start 0x%lx\n",
+ (unsigned long)page_to_phys(page));
+
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
+ end - start + 1, 1);
+ BUG_ON(PANIC_CORRUPTION);
+ dump_stack();
+}
+
+static void unpoison_page(struct page *page)
+{
+ void *addr;
+
+ addr = kmap_atomic(page);
+ /*
+ * When page poisoning is enabled, each and every page freed
+ * to the buddy allocator is poisoned. Thus no extra check is
+ * done to see if a page was poisoned.
+ */
+ check_poison_mem(page, addr, PAGE_SIZE);
+ kunmap_atomic(addr);
+}
+
+static void unpoison_pages(struct page *page, int n)
+{
+ int i;
+
+ for (i = 0; i < n; i++)
+ unpoison_page(page + i);
+}
+
+void kernel_poison_pages(struct page *page, int numpages, int enable)
+{
+ if (!page_poisoning_enabled())
+ return;
+
+ if (enable)
+ unpoison_pages(page, numpages);
+ else
+ poison_pages(page, numpages);
+}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing; all the work is done via the poison pages */
+}
+#endif
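
The single_bit_flip() check above uses the standard power-of-two test: XOR-ing the corrupted byte with PAGE_POISON leaves exactly one bit set if and only if a single bit flipped. A minimal standalone sketch of the same test (userspace, with a stand-in poison value):

#include <assert.h>

#define POISON_BYTE 0xaa	/* stand-in for PAGE_POISON */

static int single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char error = a ^ b;

	/* non-zero and a power of two <=> exactly one differing bit */
	return error && !(error & (error - 1));
}

int main(void)
{
	assert(single_bit_flip(0xab, POISON_BYTE));	/* one bit differs */
	assert(!single_bit_flip(0xff, POISON_BYTE));	/* several bits differ */
	assert(!single_bit_flip(POISON_BYTE, POISON_BYTE)); /* identical */
	return 0;
}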
diff --git a/mm/process_reclaim.c b/mm/process_reclaim.c
new file mode 100644
index 000000000000..98e5af190fe0
--- /dev/null
+++ b/mm/process_reclaim.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2015-2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/sort.h>
+#include <linux/oom.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/notifier.h>
+#include <linux/vmpressure.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/process_reclaim.h>
+
+#define MAX_SWAP_TASKS SWAP_CLUSTER_MAX
+
+static void swap_fn(struct work_struct *work);
+DECLARE_WORK(swap_work, swap_fn);
+
+/* User knob to enable/disable process reclaim feature */
+static int enable_process_reclaim;
+module_param_named(enable_process_reclaim, enable_process_reclaim, int,
+ S_IRUGO | S_IWUSR);
+
+/* The maximum number of pages to attempt to reclaim in a single run */
+int per_swap_size = SWAP_CLUSTER_MAX * 32;
+module_param_named(per_swap_size, per_swap_size, int, S_IRUGO | S_IWUSR);
+
+int reclaim_avg_efficiency;
+module_param_named(reclaim_avg_efficiency, reclaim_avg_efficiency,
+ int, S_IRUGO);
+
+/* The vmpressure region where process reclaim operates */
+static unsigned long pressure_min = 50;
+static unsigned long pressure_max = 90;
+module_param_named(pressure_min, pressure_min, ulong, S_IRUGO | S_IWUSR);
+module_param_named(pressure_max, pressure_max, ulong, S_IRUGO | S_IWUSR);
+
+static short min_score_adj = 360;
+module_param_named(min_score_adj, min_score_adj, short,
+ S_IRUGO | S_IWUSR);
+
+/*
+ * Scheduling the process reclaim workqueue unnecessarily
+ * when the reclaim efficiency is low does not make
+ * sense. We try to detect a drop in efficiency and
+ * disable reclaim for a time period. Both this period and
+ * the period over which we monitor the drop in efficiency
+ * are defined by swap_eff_win. swap_opt_eff is the optimal
+ * efficiency used as the threshold for this.
+ */
+static int swap_eff_win = 2;
+module_param_named(swap_eff_win, swap_eff_win, int, S_IRUGO | S_IWUSR);
+
+static int swap_opt_eff = 50;
+module_param_named(swap_opt_eff, swap_opt_eff, int, S_IRUGO | S_IWUSR);
+
+static atomic_t skip_reclaim = ATOMIC_INIT(0);
+/* Not atomic since only a single instance of swap_fn runs at a time */
+static int monitor_eff;
+
+struct selected_task {
+ struct task_struct *p;
+ int tasksize;
+ short oom_score_adj;
+};
+
+static int selected_cmp(const void *a, const void *b)
+{
+ const struct selected_task *x = a;
+ const struct selected_task *y = b;
+ int ret;
+
+ ret = x->tasksize < y->tasksize ? -1 : 1;
+
+ return ret;
+}
+
+static int test_task_flag(struct task_struct *p, int flag)
+{
+ struct task_struct *t = p;
+
+ rcu_read_lock();
+ for_each_thread(p, t) {
+ task_lock(t);
+ if (test_tsk_thread_flag(t, flag)) {
+ task_unlock(t);
+ rcu_read_unlock();
+ return 1;
+ }
+ task_unlock(t);
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static void swap_fn(struct work_struct *work)
+{
+ struct task_struct *tsk;
+ struct reclaim_param rp;
+
+ /* Pick the best MAX_SWAP_TASKS tasks in terms of anon size */
+ struct selected_task selected[MAX_SWAP_TASKS] = {{0, 0, 0},};
+ int si = 0;
+ int i;
+ int tasksize;
+ int total_sz = 0;
+ int total_scan = 0;
+ int total_reclaimed = 0;
+ int nr_to_reclaim;
+ int efficiency;
+
+ rcu_read_lock();
+ for_each_process(tsk) {
+ struct task_struct *p;
+ short oom_score_adj;
+
+ if (tsk->flags & PF_KTHREAD)
+ continue;
+
+ if (test_task_flag(tsk, TIF_MEMDIE))
+ continue;
+
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ continue;
+
+ oom_score_adj = p->signal->oom_score_adj;
+ if (oom_score_adj < min_score_adj) {
+ task_unlock(p);
+ continue;
+ }
+
+ tasksize = get_mm_counter(p->mm, MM_ANONPAGES);
+ task_unlock(p);
+
+ if (tasksize <= 0)
+ continue;
+
+ if (si == MAX_SWAP_TASKS) {
+ sort(&selected[0], MAX_SWAP_TASKS,
+ sizeof(struct selected_task),
+ &selected_cmp, NULL);
+ if (tasksize < selected[0].tasksize)
+ continue;
+ selected[0].p = p;
+ selected[0].oom_score_adj = oom_score_adj;
+ selected[0].tasksize = tasksize;
+ } else {
+ selected[si].p = p;
+ selected[si].oom_score_adj = oom_score_adj;
+ selected[si].tasksize = tasksize;
+ si++;
+ }
+ }
+
+ for (i = 0; i < si; i++)
+ total_sz += selected[i].tasksize;
+
+ /* Skip reclaim if the total size is too small */
+ if (total_sz < SWAP_CLUSTER_MAX) {
+ rcu_read_unlock();
+ return;
+ }
+
+ for (i = 0; i < si; i++)
+ get_task_struct(selected[i].p);
+
+ rcu_read_unlock();
+
+ while (si--) {
+ nr_to_reclaim =
+ (selected[si].tasksize * per_swap_size) / total_sz;
+ /* scan at least one page */
+ if (!nr_to_reclaim)
+ nr_to_reclaim = 1;
+
+ rp = reclaim_task_anon(selected[si].p, nr_to_reclaim);
+
+ trace_process_reclaim(selected[si].tasksize,
+ selected[si].oom_score_adj, rp.nr_scanned,
+ rp.nr_reclaimed, per_swap_size, total_sz,
+ nr_to_reclaim);
+ total_scan += rp.nr_scanned;
+ total_reclaimed += rp.nr_reclaimed;
+ put_task_struct(selected[si].p);
+ }
+
+ if (total_scan) {
+ efficiency = (total_reclaimed * 100) / total_scan;
+
+ if (efficiency < swap_opt_eff) {
+ if (++monitor_eff == swap_eff_win) {
+ atomic_set(&skip_reclaim, swap_eff_win);
+ monitor_eff = 0;
+ }
+ } else {
+ monitor_eff = 0;
+ }
+
+ reclaim_avg_efficiency =
+ (efficiency + reclaim_avg_efficiency) / 2;
+ trace_process_reclaim_eff(efficiency, reclaim_avg_efficiency);
+ }
+}
+
+static int vmpressure_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long pressure = action;
+
+ if (!enable_process_reclaim)
+ return 0;
+
+ if (!current_is_kswapd())
+ return 0;
+
+ if (atomic_dec_if_positive(&skip_reclaim) >= 0)
+ return 0;
+
+ if ((pressure >= pressure_min) && (pressure < pressure_max))
+ if (!work_pending(&swap_work))
+ queue_work(system_unbound_wq, &swap_work);
+ return 0;
+}
+
+static struct notifier_block vmpr_nb = {
+ .notifier_call = vmpressure_notifier,
+};
+
+static int __init process_reclaim_init(void)
+{
+ vmpressure_notifier_register(&vmpr_nb);
+ return 0;
+}
+
+static void __exit process_reclaim_exit(void)
+{
+ vmpressure_notifier_unregister(&vmpr_nb);
+}
+
+module_init(process_reclaim_init);
+module_exit(process_reclaim_exit);
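
swap_fn() above divides the per_swap_size budget among the selected tasks in proportion to their anonymous footprint, with a one-page floor. A standalone sketch of that apportioning, using made-up task sizes:

#include <stdio.h>

int main(void)
{
	int per_swap_size = 32 * 32;		/* SWAP_CLUSTER_MAX * 32 */
	int tasksize[] = { 5000, 1200, 30 };	/* hypothetical anon pages */
	int total_sz = 5000 + 1200 + 30;
	int i;

	for (i = 0; i < 3; i++) {
		int nr_to_reclaim = (tasksize[i] * per_swap_size) / total_sz;

		if (!nr_to_reclaim)	/* scan at least one page */
			nr_to_reclaim = 1;
		printf("task %d: reclaim up to %d pages\n", i, nr_to_reclaim);
	}
	return 0;
}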
diff --git a/mm/readahead.c b/mm/readahead.c
index ba22d7fe0afb..fb1d210dbf05 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -208,12 +208,21 @@ out:
* memory at once.
*/
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+ pgoff_t offset, unsigned long nr_to_read)
{
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct file_ra_state *ra = &filp->f_ra;
+ unsigned long max_pages;
+
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
- nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages);
+ /*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
+ nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
int err;
@@ -234,6 +243,8 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
/*
* Set the initial window size, round to next power of 2 and square
+ * Small size is not dependent on the max value - only a one-page read is
+ * regarded as small.
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
* 1-8 page = 32k initial, > 8 page = 128k initial
@@ -242,7 +253,7 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
unsigned long newsize = roundup_pow_of_two(size);
- if (newsize <= max / 32)
+ if (newsize <= 1)
newsize = newsize * 4;
else if (newsize <= max / 4)
newsize = newsize * 2;
@@ -370,10 +381,19 @@ ondemand_readahead(struct address_space *mapping,
bool hit_readahead_marker, pgoff_t offset,
unsigned long req_size)
{
- unsigned long max = ra->ra_pages;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ unsigned long max_pages = ra->ra_pages;
+ unsigned long add_pages;
pgoff_t prev_offset;
/*
+ * If the request exceeds the readahead window, allow the read to
+ * be up to the optimal hardware IO size
+ */
+ if (req_size > max_pages && bdi->io_pages > max_pages)
+ max_pages = min(req_size, bdi->io_pages);
+
+ /*
* start of file
*/
if (!offset)
@@ -386,7 +406,7 @@ ondemand_readahead(struct address_space *mapping,
if ((offset == (ra->start + ra->size - ra->async_size) ||
offset == (ra->start + ra->size))) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -401,16 +421,16 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max);
+ start = page_cache_next_hole(mapping, offset + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max)
+ if (!start || start - offset > max_pages)
return 0;
ra->start = start;
ra->size = start - offset; /* old async_size */
ra->size += req_size;
- ra->size = get_next_ra_size(ra, max);
+ ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -418,7 +438,7 @@ ondemand_readahead(struct address_space *mapping,
/*
* oversize read
*/
- if (req_size > max)
+ if (req_size > max_pages)
goto initial_readahead;
/*
@@ -434,7 +454,7 @@ ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max))
+ if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
goto readit;
/*
@@ -445,7 +465,7 @@ ondemand_readahead(struct address_space *mapping,
initial_readahead:
ra->start = offset;
- ra->size = get_init_ra_size(req_size, max);
+ ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
@@ -453,10 +473,17 @@ readit:
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
+ * Take care of maximum IO pages as above.
*/
if (offset == ra->start && ra->size == ra->async_size) {
- ra->async_size = get_next_ra_size(ra, max);
- ra->size += ra->async_size;
+ add_pages = get_next_ra_size(ra, max_pages);
+ if (ra->size + add_pages <= max_pages) {
+ ra->async_size = add_pages;
+ ra->size += add_pages;
+ } else {
+ ra->size = max_pages;
+ ra->async_size = max_pages >> 1;
+ }
}
return ra_submit(ra, mapping, filp);
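
With the hunks above, get_init_ra_size() treats only a one-page read as small (quadrupling it) and doubles anything up to a quarter of the window; the final clamp to max is assumed from the surrounding code, which the diff does not show. A standalone sketch of the resulting growth:

#include <stdio.h>

/* round up to the next power of two (for the sketch only) */
static unsigned long roundup_pow_of_two_ul(unsigned long x)
{
	unsigned long p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

/* mirrors the patched get_init_ra_size(): only one-page reads are "small" */
static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two_ul(size);

	if (newsize <= 1)
		newsize *= 4;
	else if (newsize <= max / 4)
		newsize *= 2;
	else
		newsize = max;	/* assumed clamp, not shown in the hunk */
	return newsize;
}

int main(void)
{
	/* with a 32-page (128k) window: 1 -> 4, 6 -> 16, 20 -> 32 */
	printf("%lu %lu %lu\n", init_ra_size(1, 32),
	       init_ra_size(6, 32), init_ra_size(20, 32));
	return 0;
}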
diff --git a/mm/rmap.c b/mm/rmap.c
index cf733fab230f..59e1c26d1460 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1537,9 +1537,12 @@ static int page_not_mapped(struct page *page)
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
* @flags: action and flags
+ * @vma: target vma for reclaim
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
+ * If @vma is not NULL, this function tries to remove @page only from @vma,
+ * without walking every vma that maps @page.
* Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
@@ -1547,7 +1550,8 @@ static int page_not_mapped(struct page *page)
* SWAP_FAIL - the page is unswappable
* SWAP_MLOCK - page is mlocked.
*/
-int try_to_unmap(struct page *page, enum ttu_flags flags)
+int try_to_unmap(struct page *page, enum ttu_flags flags,
+ struct vm_area_struct *vma)
{
int ret;
struct rmap_walk_control rwc = {
@@ -1555,6 +1559,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
.arg = (void *)flags,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
+ .target_vma = vma,
};
VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
@@ -1600,6 +1605,7 @@ int try_to_munlock(struct page *page)
.arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
+ .target_vma = NULL,
};
@@ -1661,6 +1667,11 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
+ if (rwc->target_vma) {
+ unsigned long address = vma_address(page, rwc->target_vma);
+ return rwc->rmap_one(page, rwc->target_vma, address, rwc->arg);
+ }
+
anon_vma = rmap_walk_anon_lock(page, rwc);
if (!anon_vma)
return ret;
@@ -1703,6 +1714,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
struct address_space *mapping = page->mapping;
pgoff_t pgoff;
struct vm_area_struct *vma;
+ unsigned long address;
int ret = SWAP_AGAIN;
/*
@@ -1718,6 +1730,12 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
pgoff = page_to_pgoff(page);
i_mmap_lock_read(mapping);
+ if (rwc->target_vma) {
+ address = vma_address(page, rwc->target_vma);
+ ret = rwc->rmap_one(page, rwc->target_vma, address, rwc->arg);
+ goto done;
+ }
+
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
diff --git a/mm/shmem.c b/mm/shmem.c
index 4cce6ff6e9d3..aea18fbf577e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1003,7 +1003,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1195,7 +1195,7 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
diff --git a/mm/showmem.c b/mm/showmem.c
new file mode 100644
index 000000000000..1103a02b2cbd
--- /dev/null
+++ b/mm/showmem.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014-2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/notifier.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+
+ATOMIC_NOTIFIER_HEAD(show_mem_notifier);
+
+int show_mem_notifier_register(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&show_mem_notifier, nb);
+}
+
+int show_mem_notifier_unregister(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&show_mem_notifier, nb);
+}
+
+void show_mem_call_notifiers(void)
+{
+ atomic_notifier_call_chain(&show_mem_notifier, 0, NULL);
+}
+
+static int show_mem_notifier_get(void *dat, u64 *val)
+{
+ show_mem_call_notifiers();
+ *val = 0;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(show_mem_notifier_debug_ops, show_mem_notifier_get,
+ NULL, "%llu\n");
+
+int show_mem_notifier_debugfs_register(void)
+{
+ debugfs_create_file("show_mem_notifier", 0664, NULL, NULL,
+ &show_mem_notifier_debug_ops);
+
+ return 0;
+}
+late_initcall(show_mem_notifier_debugfs_register);
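
A subsystem consumes the chain above by registering a notifier; reading the show_mem_notifier debugfs file then invokes every callback. A hedged module sketch (the callback and its message are hypothetical; only the register/unregister entry points come from showmem.c):

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

extern int show_mem_notifier_register(struct notifier_block *nb);
extern int show_mem_notifier_unregister(struct notifier_block *nb);

static int my_show_mem(struct notifier_block *nb, unsigned long action,
		       void *data)
{
	pr_info("mydrv: dumping private memory stats\n");
	return NOTIFY_OK;
}

static struct notifier_block my_show_mem_nb = {
	.notifier_call = my_show_mem,
};

static int __init my_init(void)
{
	return show_mem_notifier_register(&my_show_mem_nb);
}

static void __exit my_exit(void)
{
	show_mem_notifier_unregister(&my_show_mem_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");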
diff --git a/mm/slub.c b/mm/slub.c
index 753fccf31c6e..db726090a5a6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -333,11 +333,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -685,11 +687,21 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
dump_stack();
}
+#ifdef CONFIG_SLUB_DEBUG_PANIC_ON
+static void slab_panic(const char *cause)
+{
+ panic("%s\n", cause);
+}
+#else
+static inline void slab_panic(const char *cause) {}
+#endif
+
void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
slab_bug(s, "%s", reason);
print_trailer(s, page, object);
+ slab_panic(reason);
}
static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
@@ -704,6 +716,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
slab_bug(s, "%s", buf);
print_page_info(page);
dump_stack();
+ slab_panic("slab error");
}
static void init_object(struct kmem_cache *s, void *object, u8 val)
@@ -725,6 +738,7 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
+ slab_panic("object poison overwritten");
slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data);
memset(from, data, to - from);
}
@@ -1577,6 +1591,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
+ kasan_alloc_pages(page, order);
__free_kmem_pages(page, order);
}
@@ -3762,6 +3777,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(x);
+ kasan_alloc_pages(page, compound_order(page));
__free_kmem_pages(page, compound_order(page));
return;
}
diff --git a/mm/swap.c b/mm/swap.c
index 39395fb549c0..8e6bcb688779 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1130,15 +1130,25 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
}
EXPORT_SYMBOL(pagevec_lookup);
-unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
- pgoff_t *index, int tag, unsigned nr_pages)
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag)
{
- pvec->nr = find_get_pages_tag(mapping, index, tag,
- nr_pages, pvec->pages);
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ PAGEVEC_SIZE, pvec->pages);
return pagevec_count(pvec);
}
-EXPORT_SYMBOL(pagevec_lookup_tag);
+EXPORT_SYMBOL(pagevec_lookup_range_tag);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag, unsigned max_pages)
+{
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
* Perform any setup for the swap system
*/
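
Callers converted to the new lookup above pass an explicit end index instead of clamping nr_pages themselves. A hedged sketch of a writeback-style loop using the range variant (the mapping handling is schematic):

#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static void process_dirty_range(struct address_space *mapping,
				pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;

	pagevec_init(&pvec, 0);
	while (index <= end &&
	       pagevec_lookup_range_tag(&pvec, mapping, &index, end,
					PAGECACHE_TAG_DIRTY)) {
		/* ... handle pvec.pages[0..pagevec_count(&pvec) - 1] ... */
		pagevec_release(&pvec);
		cond_resched();
	}
}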
diff --git a/mm/swap_ratio.c b/mm/swap_ratio.c
new file mode 100644
index 000000000000..eae1fc183104
--- /dev/null
+++ b/mm/swap_ratio.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/mm_types.h>
+#include <linux/swapfile.h>
+#include <linux/swap.h>
+
+#define SWAP_RATIO_GROUP_START (SWAP_FLAG_PRIO_MASK - 9) /* 32758 */
+#define SWAP_RATIO_GROUP_END (SWAP_FLAG_PRIO_MASK) /* 32767 */
+#define SWAP_FAST_WRITES (SWAPFILE_CLUSTER * (SWAP_CLUSTER_MAX / 8))
+#define SWAP_SLOW_WRITES SWAPFILE_CLUSTER
+
+/*
+ * The fast/slow swap write ratio.
+ * 100 indicates that all writes should
+ * go to the fast swap device.
+ */
+int sysctl_swap_ratio = 100;
+
+/* Enable the swap ratio feature */
+int sysctl_swap_ratio_enable;
+
+static bool is_same_group(struct swap_info_struct *a,
+ struct swap_info_struct *b)
+{
+ if (!sysctl_swap_ratio_enable)
+ return false;
+
+ if (!is_swap_ratio_group(a->prio))
+ return false;
+
+ if (a->prio == b->prio)
+ return true;
+
+ return false;
+}
+
+/* Caller must hold swap_avail_lock */
+static int calculate_write_pending(struct swap_info_struct *si,
+ struct swap_info_struct *n)
+{
+ int ratio = sysctl_swap_ratio;
+
+ if ((ratio < 0) || (ratio > 100))
+ return -EINVAL;
+
+ if (WARN_ON(!(si->flags & SWP_FAST)))
+ return -ENODEV;
+
+ if ((n->flags & SWP_FAST) || !is_same_group(si, n))
+ return -ENODEV;
+
+ si->max_writes = ratio ? SWAP_FAST_WRITES : 0;
+ n->max_writes = ratio ? (SWAP_FAST_WRITES * 100) /
+ ratio - SWAP_FAST_WRITES : SWAP_SLOW_WRITES;
+
+ si->write_pending = si->max_writes;
+ n->write_pending = n->max_writes;
+
+ return 0;
+}
+
+static int swap_ratio_slow(struct swap_info_struct **si)
+{
+ struct swap_info_struct *n = NULL;
+ int ret = 0;
+
+ spin_lock(&(*si)->lock);
+ spin_lock(&swap_avail_lock);
+ if (&(*si)->avail_list == plist_last(&swap_avail_head)) {
+ /* just to make skip work */
+ n = *si;
+ ret = -ENODEV;
+ goto skip;
+ }
+ n = plist_next_entry(&(*si)->avail_list,
+ struct swap_info_struct,
+ avail_list);
+ if (n == *si) {
+ /* No other swap device */
+ ret = -ENODEV;
+ goto skip;
+ }
+
+ spin_unlock(&swap_avail_lock);
+ spin_lock(&n->lock);
+ spin_lock(&swap_avail_lock);
+
+ if ((*si)->flags & SWP_FAST) {
+ if ((*si)->write_pending) {
+ (*si)->write_pending--;
+ goto exit;
+ } else {
+ if ((n->flags & SWP_FAST) || !is_same_group(*si, n)) {
+ /* Should never happen */
+ ret = -ENODEV;
+ } else if (n->write_pending) {
+ /*
+ * Requeue fast device, since there are pending
+ * writes for slow device.
+ */
+ plist_requeue(&(*si)->avail_list,
+ &swap_avail_head);
+ n->write_pending--;
+ spin_unlock(&(*si)->lock);
+ *si = n;
+ goto skip;
+ } else {
+ if (calculate_write_pending(*si, n) < 0) {
+ ret = -ENODEV;
+ goto exit;
+ }
+ /* Restart from fast device */
+ (*si)->write_pending--;
+ }
+ }
+ } else {
+ if (!(n->flags & SWP_FAST) || !is_same_group(*si, n)) {
+ /* Should never happen */
+ ret = -ENODEV;
+ } else if (n->write_pending) {
+ /*
+ * Pending writes for fast device.
+ * We reach here when slow device is swapped on first,
+ * before fast device.
+ */
+ /* requeue slow device to the end */
+ plist_requeue(&(*si)->avail_list, &swap_avail_head);
+ n->write_pending--;
+ spin_unlock(&(*si)->lock);
+ *si = n;
+ goto skip;
+ } else {
+ if ((*si)->write_pending) {
+ (*si)->write_pending--;
+ } else {
+ if (calculate_write_pending(n, *si) < 0) {
+ ret = -ENODEV;
+ goto exit;
+ }
+ n->write_pending--;
+ plist_requeue(&(*si)->avail_list,
+ &swap_avail_head);
+ spin_unlock(&(*si)->lock);
+ *si = n;
+ goto skip;
+ }
+ }
+ }
+exit:
+ spin_unlock(&(*si)->lock);
+skip:
+ spin_unlock(&swap_avail_lock);
+ /* n and si may have been swapped above */
+ spin_unlock(&n->lock);
+ return ret;
+}
+
+bool is_swap_ratio_group(int prio)
+{
+ return (prio >= SWAP_RATIO_GROUP_START) &&
+ (prio <= SWAP_RATIO_GROUP_END);
+}
+
+void setup_swap_ratio(struct swap_info_struct *p, int prio)
+{
+ /* Used only if sysctl_swap_ratio_enable is set */
+ if (is_swap_ratio_group(prio)) {
+ if (p->flags & SWP_FAST)
+ p->write_pending = SWAP_FAST_WRITES;
+ else
+ p->write_pending = SWAP_SLOW_WRITES;
+ p->max_writes = p->write_pending;
+ }
+}
+
+int swap_ratio(struct swap_info_struct **si)
+{
+ if (!sysctl_swap_ratio_enable)
+ return -ENODEV;
+
+ if (is_swap_ratio_group((*si)->prio))
+ return swap_ratio_slow(si);
+ else
+ return -ENODEV;
+}
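
calculate_write_pending() above grants the fast device SWAP_FAST_WRITES writes per round and sizes the slow device's quota so the configured percentage holds. A standalone arithmetic sketch (it assumes SWAPFILE_CLUSTER = 256 and SWAP_CLUSTER_MAX = 32, i.e. SWAP_FAST_WRITES = 1024):

#include <stdio.h>

int main(void)
{
	int fast_writes = 256 * (32 / 8);	/* SWAP_FAST_WRITES */
	int ratio = 75;				/* sysctl_swap_ratio */
	int slow_writes = (fast_writes * 100) / ratio - fast_writes;

	/* 1024 fast writes are paired with 341 slow writes: ~75% fast */
	printf("fast %d, slow %d (%d%% fast)\n", fast_writes, slow_writes,
	       100 * fast_writes / (fast_writes + slow_writes));
	return 0;
}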
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9e587464e634..f3e92b0fef52 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -97,6 +97,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
if (likely(!error)) {
address_space->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SWAPCACHE);
INC_CACHE_INFO(add_total);
}
spin_unlock_irq(&address_space->tree_lock);
@@ -149,6 +150,7 @@ void __delete_from_swap_cache(struct page *page)
ClearPageSwapCache(page);
address_space->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
+ __dec_zone_page_state(page, NR_SWAPCACHE);
INC_CACHE_INFO(del_total);
}
@@ -354,7 +356,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -368,7 +370,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
@@ -473,7 +475,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long mask;
struct blk_plug plug;
- mask = swapin_nr_pages(offset) - 1;
+ mask = is_swap_fast(entry) ? 0 : swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 623c77c1327b..feb9abb8e8cc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -75,8 +75,8 @@ PLIST_HEAD(swap_active_head);
* is held and the locking order requires swap_lock to be taken
* before any swap_info_struct->lock.
*/
-static PLIST_HEAD(swap_avail_head);
-static DEFINE_SPINLOCK(swap_avail_lock);
+PLIST_HEAD(swap_avail_head);
+DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -91,6 +91,26 @@ static inline unsigned char swap_count(unsigned char ent)
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
}
+bool is_swap_fast(swp_entry_t entry)
+{
+ struct swap_info_struct *p;
+ unsigned long type;
+
+ if (non_swap_entry(entry))
+ return false;
+
+ type = swp_type(entry);
+ if (type >= nr_swapfiles)
+ return false;
+
+ p = swap_info[type];
+
+ if (p->flags & SWP_FAST)
+ return true;
+
+ return false;
+}
+
/* returns 1 if swap entry is freed */
static int
__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
@@ -193,7 +213,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
}
}
-#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
static inline void cluster_set_flag(struct swap_cluster_info *info,
@@ -564,7 +583,7 @@ checks:
scan_base = offset = si->lowest_bit;
/* reuse swap entry of cache-only swap if not busy. */
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&si->lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -604,7 +623,8 @@ scan:
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full(si) &&
+ si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
@@ -619,7 +639,8 @@ scan:
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full(si) &&
+ si->swap_map[offset] == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
@@ -640,18 +661,39 @@ swp_entry_t get_swap_page(void)
{
struct swap_info_struct *si, *next;
pgoff_t offset;
+ int swap_ratio_off = 0;
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
+lock_and_start:
spin_lock(&swap_avail_lock);
start_over:
plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+
+ if (sysctl_swap_ratio && !swap_ratio_off) {
+ int ret;
+
+ spin_unlock(&swap_avail_lock);
+ ret = swap_ratio(&si);
+ if (ret < 0) {
+ /*
+ * Error. Start again with swap
+ * ratio disabled.
+ */
+ swap_ratio_off = 1;
+ goto lock_and_start;
+ } else {
+ goto start;
+ }
+ }
+
/* requeue si to after same-priority siblings */
plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
+start:
spin_lock(&si->lock);
if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
spin_lock(&swap_avail_lock);
@@ -932,11 +974,25 @@ int reuse_swap_page(struct page *page)
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
- if (count == 1 && !PageWriteback(page)) {
+ if (count != 1)
+ goto out;
+ if (!PageWriteback(page)) {
delete_from_swap_cache(page);
SetPageDirty(page);
+ } else {
+ swp_entry_t entry;
+ struct swap_info_struct *p;
+
+ entry.val = page_private(page);
+ p = swap_info_get(entry);
+ if (p->flags & SWP_STABLE_WRITES) {
+ spin_unlock(&p->lock);
+ return false;
+ }
+ spin_unlock(&p->lock);
}
}
+out:
return count <= 1;
}
@@ -1008,7 +1064,8 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ (!page_mapped(page) ||
+ vm_swap_full(page_swap_info(page)))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -2481,6 +2538,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -ENOMEM;
goto bad_swap;
}
+
+ if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ p->flags |= SWP_STABLE_WRITES;
+
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
@@ -2553,11 +2614,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
}
+ if (p->bdev && blk_queue_fast(bdev_get_queue(p->bdev)))
+ p->flags |= SWP_FAST;
+
mutex_lock(&swapon_mutex);
prio = -1;
- if (swap_flags & SWAP_FLAG_PREFER)
+ if (swap_flags & SWAP_FLAG_PREFER) {
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+ setup_swap_ratio(p, prio);
+ }
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
diff --git a/mm/truncate.c b/mm/truncate.c
index aeb1adba329e..f6fedb876604 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -63,6 +63,171 @@ unlock:
spin_unlock_irq(&mapping->tree_lock);
}
+static void do_truncate_inode_pages_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend, bool fill_zero)
+{
+ pgoff_t start; /* inclusive */
+ pgoff_t end; /* exclusive */
+ unsigned int partial_start; /* inclusive */
+ unsigned int partial_end; /* exclusive */
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ pgoff_t index;
+ int i;
+
+ cleancache_invalidate_inode(mapping);
+ if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ return;
+
+ /* Offsets within partial pages */
+ partial_start = lstart & (PAGE_CACHE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+
+ /*
+ * 'start' and 'end' always covers the range of pages to be fully
+ * truncated. Partial pages are covered with 'partial_start' at the
+ * start of the range and 'partial_end' at the end of the range.
+ * Note that 'end' is exclusive while 'lend' is inclusive.
+ */
+ start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ if (lend == -1)
+ /*
+ * lend == -1 indicates end-of-file so we have to set 'end'
+ * to the highest possible pgoff_t and since the type is
+ * unsigned we're using -1.
+ */
+ end = -1;
+ else
+ end = (lend + 1) >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ index = start;
+
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
+ if (!trylock_page(page))
+ continue;
+ WARN_ON(page->index != index);
+ if (PageWriteback(page)) {
+ unlock_page(page);
+ continue;
+ }
+ truncate_inode_page(mapping, page);
+ if (fill_zero)
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ cond_resched();
+ index++;
+ }
+
+ if (partial_start) {
+ struct page *page = find_lock_page(mapping, start - 1);
+
+ if (page) {
+ unsigned int top = PAGE_CACHE_SIZE;
+
+ if (start > end) {
+ /* Truncation within a single page */
+ top = partial_end;
+ partial_end = 0;
+ }
+ wait_on_page_writeback(page);
+ zero_user_segment(page, partial_start, top);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, partial_start,
+ top - partial_start);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ if (partial_end) {
+ struct page *page = find_lock_page(mapping, end);
+
+ if (page) {
+ wait_on_page_writeback(page);
+ zero_user_segment(page, 0, partial_end);
+ cleancache_invalidate_page(mapping, page);
+ if (page_has_private(page))
+ do_invalidatepage(page, 0,
+ partial_end);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ /*
+ * If the truncation happened within a single page no pages
+ * will be released, just zeroed, so we can bail out now.
+ */
+ if (start >= end)
+ return;
+
+ index = start;
+ for ( ; ; ) {
+ cond_resched();
+ if (!pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+ /* If all gone from start onwards, we're done */
+ if (index == start)
+ break;
+ /* Otherwise restart to make sure all gone */
+ index = start;
+ continue;
+ }
+ if (index == start && indices[0] >= end) {
+ /* All gone out of hole to be punched, we're done */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ break;
+ }
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+
+ /* We rely upon deletion not changing page->index */
+ index = indices[i];
+ if (index >= end) {
+ /* Restart punch to make sure all gone */
+ index = start - 1;
+ break;
+ }
+
+ if (radix_tree_exceptional_entry(page)) {
+ clear_exceptional_entry(mapping, index, page);
+ continue;
+ }
+
+ lock_page(page);
+ WARN_ON(page->index != index);
+ wait_on_page_writeback(page);
+ truncate_inode_page(mapping, page);
+ if (fill_zero)
+ zero_user(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ index++;
+ }
+ cleancache_invalidate_inode(mapping);
+}
+
/**
* do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected
@@ -218,162 +383,43 @@ int invalidate_inode_page(struct page *page)
void truncate_inode_pages_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- pgoff_t start; /* inclusive */
- pgoff_t end; /* exclusive */
- unsigned int partial_start; /* inclusive */
- unsigned int partial_end; /* exclusive */
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- pgoff_t index;
- int i;
-
- cleancache_invalidate_inode(mapping);
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
- return;
-
- /* Offsets within partial pages */
- partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
-
- /*
- * 'start' and 'end' always covers the range of pages to be fully
- * truncated. Partial pages are covered with 'partial_start' at the
- * start of the range and 'partial_end' at the end of the range.
- * Note that 'end' is exclusive while 'lend' is inclusive.
- */
- start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (lend == -1)
- /*
- * lend == -1 indicates end-of-file so we have to set 'end'
- * to the highest possible pgoff_t and since the type is
- * unsigned we're using -1.
- */
- end = -1;
- else
- end = (lend + 1) >> PAGE_CACHE_SHIFT;
-
- pagevec_init(&pvec, 0);
- index = start;
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end)
- break;
-
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
- if (!trylock_page(page))
- continue;
- WARN_ON(page->index != index);
- if (PageWriteback(page)) {
- unlock_page(page);
- continue;
- }
- truncate_inode_page(mapping, page);
- unlock_page(page);
- }
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- cond_resched();
- index++;
- }
-
- if (partial_start) {
- struct page *page = find_lock_page(mapping, start - 1);
- if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
- if (start > end) {
- /* Truncation within a single page */
- top = partial_end;
- partial_end = 0;
- }
- wait_on_page_writeback(page);
- zero_user_segment(page, partial_start, top);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial_start,
- top - partial_start);
- unlock_page(page);
- page_cache_release(page);
- }
- }
- if (partial_end) {
- struct page *page = find_lock_page(mapping, end);
- if (page) {
- wait_on_page_writeback(page);
- zero_user_segment(page, 0, partial_end);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, 0,
- partial_end);
- unlock_page(page);
- page_cache_release(page);
- }
- }
- /*
- * If the truncation happened within a single page no pages
- * will be released, just zeroed, so we can bail out now.
- */
- if (start >= end)
- return;
-
- index = start;
- for ( ; ; ) {
- cond_resched();
- if (!pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
- /* If all gone from start onwards, we're done */
- if (index == start)
- break;
- /* Otherwise restart to make sure all gone */
- index = start;
- continue;
- }
- if (index == start && indices[0] >= end) {
- /* All gone out of hole to be punched, we're done */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- break;
- }
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end) {
- /* Restart punch to make sure all gone */
- index = start - 1;
- break;
- }
-
- if (radix_tree_exceptional_entry(page)) {
- clear_exceptional_entry(mapping, index, page);
- continue;
- }
-
- lock_page(page);
- WARN_ON(page->index != index);
- wait_on_page_writeback(page);
- truncate_inode_page(mapping, page);
- unlock_page(page);
- }
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- index++;
- }
- cleancache_invalidate_inode(mapping);
+ do_truncate_inode_pages_range(mapping, lstart, lend, false);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
/**
+ * truncate_inode_pages_range_fill_zero - truncate a range of pages specified
+ * by start and end byte offsets and zero them out
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate (inclusive)
+ *
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
+ *
+ * Truncate takes two passes - the first pass is nonblocking. It will not
+ * block on page locks and it will not block on writeback. The second pass
+ * will wait. This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * We pass down the cache-hot hint to the page freeing code. Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidatepage() accepts a range to invalidate,
+ * truncate_inode_pages_range is able to properly handle cases where lend + 1
+ * is not page aligned.
+ */
+void truncate_inode_pages_range_fill_zero(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ do_truncate_inode_pages_range(mapping, lstart, lend, true);
+}
+EXPORT_SYMBOL(truncate_inode_pages_range_fill_zero);
+
+/**
* truncate_inode_pages - truncate *all* the pages from an offset
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
@@ -392,6 +438,27 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
EXPORT_SYMBOL(truncate_inode_pages);
/**
+ * truncate_inode_pages_fill_zero - truncate *all* the pages from an offset
+ * and zero them out
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_mutex.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
+ */
+void truncate_inode_pages_fill_zero(struct address_space *mapping,
+ loff_t lstart)
+{
+ truncate_inode_pages_range_fill_zero(mapping, lstart, (loff_t)-1);
+}
+EXPORT_SYMBOL(truncate_inode_pages_fill_zero);
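
A filesystem that wants truncated page-cache contents scrubbed rather than merely dropped would call the _fill_zero variants from its truncate path. A hedged sketch (the inode handling is schematic, locking is the caller's responsibility, and the declaration is assumed to come from the headers this patch touches):

#include <linux/fs.h>
#include <linux/mm.h>

static void example_scrub_truncate(struct inode *inode, loff_t newsize)
{
	/* shrink i_size, then zero and drop the now-stale cache pages */
	i_size_write(inode, newsize);
	truncate_inode_pages_range_fill_zero(inode->i_mapping, newsize,
					     (loff_t)-1);
}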
+
+
+/**
* truncate_inode_pages_final - truncate *all* pages before inode dies
* @mapping: mapping to truncate
*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..497248b93a4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -182,13 +182,9 @@ retry:
goto out_unlock;
/*
- * Be strict and only allow __mcopy_atomic on userfaultfd
- * registered ranges to prevent userland errors going
- * unnoticed. As far as the VM consistency is concerned, it
- * would be perfectly safe to remove this check, but there's
- * no useful usage for __mcopy_atomic ouside of userfaultfd
- * registered ranges. This is after all why these are ioctls
- * belonging to the userfaultfd and not syscalls.
+ * Check the vma is registered in uffd, this is required to
+ * enforce the VM_MAYWRITE check done at uffd registration
+ * time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
goto out_unlock;
diff --git a/mm/util.c b/mm/util.c
index 4d59b4ffe66b..4b3890dc40e3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -368,9 +368,10 @@ struct address_space *page_mapping(struct page *page)
}
mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
- return page->mapping;
+
+ return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);
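
The rewritten page_mapping() above treats the low bits of page->mapping as a tag: PAGE_MAPPING_ANON marks an anon_vma pointer (so NULL is returned), and the remaining flag bits are masked off before the address_space pointer is handed back. A standalone sketch of that tagged-pointer decode (the flag values mirror PAGE_MAPPING_ANON/PAGE_MAPPING_FLAGS of this era but are restated here for a self-contained build):

#include <stdio.h>
#include <stdint.h>

#define PAGE_MAPPING_ANON	0x1UL
#define PAGE_MAPPING_FLAGS	0x3UL	/* anon + KSM bits */

/* decode a tagged page->mapping word; mirrors the patched page_mapping() */
static void *decode_mapping(uintptr_t mapping)
{
	if (mapping & PAGE_MAPPING_ANON)
		return NULL;			/* anon_vma, not a mapping */
	return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}

int main(void)
{
	uintptr_t file_backed = 0x1000;		/* plain address_space */
	uintptr_t anon = 0x2000 | PAGE_MAPPING_ANON;

	printf("%p %p\n", decode_mapping(file_backed), decode_mapping(anon));
	return 0;
}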
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e44eb1aeec4a..8884934b4bcb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x01
-#define VM_LAZY_FREEING 0x02
#define VM_VM_AREA 0x04
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
/* The vmap cache globals are protected by vmap_area_lock */
@@ -291,6 +290,57 @@ static unsigned long cached_align;
static unsigned long vmap_area_pcpu_hole;
+#ifdef CONFIG_ENABLE_VMALLOC_SAVING
+#define POSSIBLE_VMALLOC_START PAGE_OFFSET
+
+#define VMALLOC_BITMAP_SIZE ((VMALLOC_END - PAGE_OFFSET) >> \
+ PAGE_SHIFT)
+#define VMALLOC_TO_BIT(addr) (((addr) - PAGE_OFFSET) >> PAGE_SHIFT)
+#define BIT_TO_VMALLOC(i) (PAGE_OFFSET + (i) * PAGE_SIZE)
+
+unsigned long total_vmalloc_size;
+unsigned long vmalloc_reserved;
+
+DECLARE_BITMAP(possible_areas, VMALLOC_BITMAP_SIZE);
+
+void mark_vmalloc_reserved_area(void *x, unsigned long size)
+{
+ unsigned long addr = (unsigned long)x;
+
+ bitmap_set(possible_areas, VMALLOC_TO_BIT(addr), size >> PAGE_SHIFT);
+ vmalloc_reserved += size;
+}
+
+int is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)x;
+
+ if (addr < POSSIBLE_VMALLOC_START || addr >= VMALLOC_END)
+ return 0;
+
+ if (test_bit(VMALLOC_TO_BIT(addr), possible_areas))
+ return 0;
+
+ return 1;
+}
+
+static void calc_total_vmalloc_size(void)
+{
+ total_vmalloc_size = VMALLOC_END - POSSIBLE_VMALLOC_START -
+ vmalloc_reserved;
+}
+#else
+int is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)x;
+
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+
+static void calc_total_vmalloc_size(void) { }
+#endif
+EXPORT_SYMBOL(is_vmalloc_addr);
+
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
@@ -363,6 +413,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ might_sleep();
+
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
@@ -473,7 +525,7 @@ overflow:
purged = 1;
goto retry;
}
- if (printk_ratelimit())
+ if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
kfree(va);
@@ -576,11 +628,19 @@ static unsigned long lazy_max_pages(void)
log = fls(num_online_cpus());
- return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+ return log * (1UL * CONFIG_VMAP_LAZY_PURGING_FACTOR *
+ 1024 * 1024 / PAGE_SIZE);
}
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/*
+ * Serialize vmap purging. There is no actual critical section protected
+ * by this lock, but we want to avoid concurrent calls for performance
+ * reasons and to make pcpu_get_vm_areas more deterministic.
+ */
+static DEFINE_MUTEX(vmap_purge_lock);
+
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
@@ -595,65 +655,40 @@ void set_iounmap_nonlazy(void)
/*
* Purges all lazily-freed vmap areas.
- *
- * If sync is 0 then don't purge if there is already a purge in progress.
- * If force_flush is 1, then flush kernel TLBs between *start and *end even
- * if we found no lazy vmap areas to unmap (callers can use this to optimise
- * their own TLB flushing).
- * Returns with *start = min(*start, lowest purged address)
- * *end = max(*end, highest purged address)
*/
-static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
- int sync, int force_flush)
+static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
- static DEFINE_SPINLOCK(purge_lock);
- LIST_HEAD(valist);
+ struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- int nr = 0;
+ bool do_free = false;
- /*
- * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
- * should not expect such behaviour. This just simplifies locking for
- * the case that isn't actually used at the moment anyway.
- */
- if (!sync && !force_flush) {
- if (!spin_trylock(&purge_lock))
- return;
- } else
- spin_lock(&purge_lock);
+ lockdep_assert_held(&vmap_purge_lock);
- if (sync)
- purge_fragmented_blocks_allcpus();
-
- rcu_read_lock();
- list_for_each_entry_rcu(va, &vmap_area_list, list) {
- if (va->flags & VM_LAZY_FREE) {
- if (va->va_start < *start)
- *start = va->va_start;
- if (va->va_end > *end)
- *end = va->va_end;
- nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
- list_add_tail(&va->purge_list, &valist);
- va->flags |= VM_LAZY_FREEING;
- va->flags &= ~VM_LAZY_FREE;
- }
+ valist = llist_del_all(&vmap_purge_list);
+ llist_for_each_entry(va, valist, purge_list) {
+ if (va->va_start < start)
+ start = va->va_start;
+ if (va->va_end > end)
+ end = va->va_end;
+ do_free = true;
}
- rcu_read_unlock();
- if (nr)
- atomic_sub(nr, &vmap_lazy_nr);
+ if (!do_free)
+ return false;
- if (nr || force_flush)
- flush_tlb_kernel_range(*start, *end);
+ flush_tlb_kernel_range(start, end);
- if (nr) {
- spin_lock(&vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &valist, purge_list)
- __free_vmap_area(va);
- spin_unlock(&vmap_area_lock);
+ spin_lock(&vmap_area_lock);
+ llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+ __free_vmap_area(va);
+ atomic_sub(nr, &vmap_lazy_nr);
+ cond_resched_lock(&vmap_area_lock);
}
- spin_unlock(&purge_lock);
+ spin_unlock(&vmap_area_lock);
+ return true;
}
/*
@@ -662,9 +697,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
*/
static void try_purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 0, 0);
+ if (mutex_trylock(&vmap_purge_lock)) {
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+ }
}
/*
@@ -672,9 +708,10 @@ static void try_purge_vmap_area_lazy(void)
*/
static void purge_vmap_area_lazy(void)
{
- unsigned long start = ULONG_MAX, end = 0;
-
- __purge_vmap_area_lazy(&start, &end, 1, 0);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
}
/*
@@ -684,20 +721,16 @@ static void purge_vmap_area_lazy(void)
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- va->flags |= VM_LAZY_FREE;
- atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
- if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
- try_purge_vmap_area_lazy();
-}
+ int nr_lazy;
-/*
- * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
- * called for the correct range previously.
- */
-static void free_unmap_vmap_area_noflush(struct vmap_area *va)
-{
- unmap_vmap_area(va);
- free_vmap_area_noflush(va);
+ nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
+
+ /* After this point, we may free va at any time */
+ llist_add(&va->purge_list, &vmap_purge_list);
+
+ if (unlikely(nr_lazy > lazy_max_pages()))
+ try_purge_vmap_area_lazy();
}
/*
@@ -706,7 +739,8 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- free_unmap_vmap_area_noflush(va);
+ unmap_vmap_area(va);
+ free_vmap_area_noflush(va);
}
static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -720,16 +754,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
return va;
}
-static void free_unmap_vmap_area_addr(unsigned long addr)
-{
- struct vmap_area *va;
-
- va = find_vmap_area(addr);
- BUG_ON(!va);
- free_unmap_vmap_area(va);
-}
-
-
/*** Per cpu kva allocator ***/
/*
@@ -1050,6 +1074,8 @@ void vm_unmap_aliases(void)
if (unlikely(!vmap_initialized))
return;
+ might_sleep();
+
for_each_possible_cpu(cpu) {
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
struct vmap_block *vb;
@@ -1074,7 +1100,11 @@ void vm_unmap_aliases(void)
rcu_read_unlock();
}
- __purge_vmap_area_lazy(&start, &end, 1, flush);
+ mutex_lock(&vmap_purge_lock);
+ purge_fragmented_blocks_allcpus();
+ if (!__purge_vmap_area_lazy(start, end) && flush)
+ flush_tlb_kernel_range(start, end);
+ mutex_unlock(&vmap_purge_lock);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
@@ -1087,7 +1117,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = count << PAGE_SHIFT;
unsigned long addr = (unsigned long)mem;
+ struct vmap_area *va;
+ might_sleep();
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
@@ -1096,10 +1128,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
- if (likely(count <= VMAP_MAX_ALLOC))
+ if (likely(count <= VMAP_MAX_ALLOC)) {
vb_free(mem, size);
- else
- free_unmap_vmap_area_addr(addr);
+ return;
+ }
+
+ va = find_vmap_area(addr);
+ BUG_ON(!va);
+ free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1148,6 +1184,33 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+
+/**
+ * vm_area_check_early - check if vmap area is already mapped
+ * @vm: vm_struct to be checked
+ *
+ * This function is used to check if the vmap area has been
+ * mapped already. @vm->addr, @vm->size and @vm->flags should
+ * contain proper values.
+ */
+int __init vm_area_check_early(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+
+ BUG_ON(vmap_initialized);
+ for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+ if (tmp->addr >= vm->addr) {
+ if (tmp->addr < vm->addr + vm->size)
+ return 1;
+ } else {
+ if (tmp->addr + tmp->size > vm->addr)
+ return 1;
+ }
+ }
+ return 0;
+}
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1228,7 +1291,7 @@ void __init vmalloc_init(void)
}
vmap_area_pcpu_hole = VMALLOC_END;
-
+ calc_total_vmalloc_size();
vmap_initialized = true;
}
@@ -1392,16 +1455,27 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
+#ifdef CONFIG_ENABLE_VMALLOC_SAVING
+ return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL,
+ __builtin_return_address(0));
+#else
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
+#endif
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
+#ifdef CONFIG_ENABLE_VMALLOC_SAVING
+ return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
+#else
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
+#endif
}
/**
@@ -1435,6 +1509,8 @@ struct vm_struct *remove_vm_area(const void *addr)
{
struct vmap_area *va;
+ might_sleep();
+
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->vm;
@@ -1494,7 +1570,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
kfree(area);
return;
}
-
+
+static inline void __vfree_deferred(const void *addr)
+{
+ /*
+ * Use raw_cpu_ptr() because this can be called from preemptible
+ * context. Preemption is absolutely fine here, because the llist_add()
+ * implementation is lockless, so it works even if we are adding to
+ * another cpu's list. schedule_work() should be fine with this too.
+ */
+ struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ */
+void vfree_atomic(const void *addr)
+{
+ BUG_ON(in_nmi());
+
+ kmemleak_free(addr);
+
+ if (!addr)
+ return;
+ __vfree_deferred(addr);
+}
+
/**
* vfree - release memory allocated by vmalloc()
* @addr: memory base address
@@ -1517,11 +1625,9 @@ void vfree(const void *addr)
if (!addr)
return;
- if (unlikely(in_interrupt())) {
- struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
- if (llist_add((struct llist_node *)addr, &p->list))
- schedule_work(&p->wq);
- } else
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
__vunmap(addr, 1);
}
EXPORT_SYMBOL(vfree);
@@ -2671,6 +2777,9 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_VPAGES)
seq_puts(m, " vpages");
+ if (v->flags & VM_LOWMEM)
+ seq_puts(m, " lowmem");
+
show_numa_info(m, v);
seq_putc(m, '\n');
return 0;
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 3fb15c25af87..f5383e43597a 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -22,6 +22,9 @@
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/printk.h>
+#include <linux/notifier.h>
+#include <linux/init.h>
+#include <linux/module.h>
#include <linux/vmpressure.h>
/*
@@ -38,7 +41,7 @@
* TODO: Make the window size depend on machine size, as we do for vmstat
* thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
*/
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
/*
* These thresholds are used when we account memory pressure through
@@ -49,6 +52,33 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
static const unsigned int vmpressure_level_med = 60;
static const unsigned int vmpressure_level_critical = 95;
+static unsigned long vmpressure_scale_max = 100;
+module_param_named(vmpressure_scale_max, vmpressure_scale_max,
+ ulong, S_IRUGO | S_IWUSR);
+
+/* vmpressure values >= this will be scaled based on allocstalls */
+static unsigned long allocstall_threshold = 70;
+module_param_named(allocstall_threshold, allocstall_threshold,
+ ulong, S_IRUGO | S_IWUSR);
+
+static struct vmpressure global_vmpressure;
+BLOCKING_NOTIFIER_HEAD(vmpressure_notifier);
+
+int vmpressure_notifier_register(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&vmpressure_notifier, nb);
+}
+
+int vmpressure_notifier_unregister(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&vmpressure_notifier, nb);
+}
+
+void vmpressure_notify(unsigned long pressure)
+{
+ blocking_notifier_call_chain(&vmpressure_notifier, pressure, NULL);
+}
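+
+/*
+ * Illustrative client (sketch, hypothetical names): a driver interested
+ * in global pressure events could register a blocking notifier; the
+ * value passed is the 0-100 (possibly stall-scaled) pressure from
+ * vmpressure_notify():
+ *
+ *    static int my_vmpressure_cb(struct notifier_block *nb,
+ *                                unsigned long pressure, void *data)
+ *    {
+ *        if (pressure >= 95)
+ *            ;    // e.g. drop caches aggressively
+ *        return NOTIFY_OK;
+ *    }
+ *    static struct notifier_block my_nb = {
+ *        .notifier_call = my_vmpressure_cb,
+ *    };
+ *    ...
+ *    vmpressure_notifier_register(&my_nb);
+ */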
+
/*
* When there are too few pages left to scan, vmpressure() may miss the
* critical pressure as number of pages will be less than "window size".
@@ -75,6 +105,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work)
return container_of(work, struct vmpressure, work);
}
+#ifdef CONFIG_MEMCG
static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
{
struct cgroup_subsys_state *css = vmpressure_to_css(vmpr);
@@ -85,6 +116,12 @@ static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
return NULL;
return memcg_to_vmpressure(memcg);
}
+#else
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+ return NULL;
+}
+#endif
enum vmpressure_levels {
VMPRESSURE_LOW = 0,
@@ -108,7 +145,7 @@ static enum vmpressure_levels vmpressure_level(unsigned long pressure)
return VMPRESSURE_LOW;
}
-static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+static unsigned long vmpressure_calc_pressure(unsigned long scanned,
unsigned long reclaimed)
{
unsigned long scale = scanned + reclaimed;
@@ -135,7 +172,20 @@ out:
pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
scanned, reclaimed);
- return vmpressure_level(pressure);
+ return pressure;
+}
+
+static unsigned long vmpressure_account_stall(unsigned long pressure,
+ unsigned long stall, unsigned long scanned)
+{
+ unsigned long scale;
+
+ if (pressure < allocstall_threshold)
+ return pressure;
+
+ scale = ((vmpressure_scale_max - pressure) * stall) / scanned;
+
+ return pressure + scale;
}
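+
+/*
+ * Worked example (illustrative): with vmpressure_scale_max = 100 and
+ * allocstall_threshold = 70, a raw pressure of 80 where every scanned
+ * page stalled in direct reclaim (stall == scanned) gives
+ * scale = (100 - 80) * stall / scanned = 20, so the reported pressure
+ * becomes 100; with no direct-reclaim stalls (stall == 0) it stays at
+ * 80, and anything below 70 is passed through unscaled.
+ */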
struct vmpressure_event {
@@ -149,9 +199,11 @@ static bool vmpressure_event(struct vmpressure *vmpr,
{
struct vmpressure_event *ev;
enum vmpressure_levels level;
+ unsigned long pressure;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
+ pressure = vmpressure_calc_pressure(scanned, reclaimed);
+ level = vmpressure_level(pressure);
mutex_lock(&vmpr->events_lock);
@@ -203,24 +255,13 @@ static void vmpressure_work_fn(struct work_struct *work)
} while ((vmpr = vmpressure_parent(vmpr)));
}
-/**
- * vmpressure() - Account memory pressure through scanned/reclaimed ratio
- * @gfp: reclaimer's gfp mask
- * @memcg: cgroup memory controller handle
- * @scanned: number of pages scanned
- * @reclaimed: number of pages reclaimed
- *
- * This function should be called from the vmscan reclaim path to account
- * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
- * pressure index is then further refined and averaged over time.
- *
- * This function does not return any value.
- */
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure_memcg(gfp_t gfp, struct mem_cgroup *memcg,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+ BUG_ON(!vmpr);
+
/*
* Here we only want to account pressure that userland is able to
* help us with. For example, suppose that DMA zone is under
@@ -257,6 +298,94 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
schedule_work(&vmpr->work);
}
+void calculate_vmpressure_win(void)
+{
+ long x;
+
+ x = global_page_state(NR_FILE_PAGES) -
+ global_page_state(NR_SHMEM) -
+ total_swapcache_pages() +
+ global_page_state(NR_FREE_PAGES);
+ if (x < 1)
+ x = 1;
+ /*
+ * When (free + cached) is low, the vmpressure window should be
+ * small, and larger for higher values of (free + cached), but the
+ * relation should not be linear. This ensures timely vmpressure
+ * notifications when the system is under memory pressure, and an
+ * optimal number of events when cached memory is high. The square
+ * root function was empirically found to serve this purpose.
+ */
+ x = int_sqrt(x);
+ vmpressure_win = x;
+}
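+
+/*
+ * Worked example (illustrative): with about 1GB of free plus
+ * reclaimable file memory (262144 pages of 4KB), the window becomes
+ * int_sqrt(262144) = 512 pages (2MB); with only 64MB (16384 pages) it
+ * shrinks to 128 pages, so notifications fire sooner as memory gets
+ * tight.
+ */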
+
+void vmpressure_global(gfp_t gfp, unsigned long scanned,
+ unsigned long reclaimed)
+{
+ struct vmpressure *vmpr = &global_vmpressure;
+ unsigned long pressure;
+ unsigned long stall;
+
+ if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+ return;
+
+ if (!scanned)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ if (!vmpr->scanned)
+ calculate_vmpressure_win();
+
+ vmpr->scanned += scanned;
+ vmpr->reclaimed += reclaimed;
+
+ if (!current_is_kswapd())
+ vmpr->stall += scanned;
+
+ stall = vmpr->stall;
+ scanned = vmpr->scanned;
+ reclaimed = vmpr->reclaimed;
+ spin_unlock(&vmpr->sr_lock);
+
+ if (scanned < vmpressure_win)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ vmpr->scanned = 0;
+ vmpr->reclaimed = 0;
+ vmpr->stall = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ pressure = vmpressure_calc_pressure(scanned, reclaimed);
+ pressure = vmpressure_account_stall(pressure, stall, scanned);
+ vmpressure_notify(pressure);
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+ unsigned long scanned, unsigned long reclaimed)
+{
+ if (!memcg)
+ vmpressure_global(gfp, scanned, reclaimed);
+
+ if (IS_ENABLED(CONFIG_MEMCG))
+ vmpressure_memcg(gfp, memcg, scanned, reclaimed);
+}
+
/**
* vmpressure_prio() - Account memory pressure through reclaimer priority level
* @gfp: reclaimer's gfp mask
@@ -308,6 +437,8 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
struct vmpressure_event *ev;
int level;
+ BUG_ON(!vmpr);
+
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
if (!strcmp(vmpressure_str_levels[level], args))
break;
@@ -347,6 +478,8 @@ void vmpressure_unregister_event(struct mem_cgroup *memcg,
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
+ BUG_ON(!vmpr);
+
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
if (ev->efd != eventfd)
@@ -388,3 +521,10 @@ void vmpressure_cleanup(struct vmpressure *vmpr)
*/
flush_work(&vmpr->work);
}
+
+int vmpressure_global_init(void)
+{
+ vmpressure_init(&global_vmpressure);
+ return 0;
+}
+late_initcall(vmpressure_global_init);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76853088f66b..aa1074d3031b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -104,6 +104,13 @@ struct scan_control {
/* Number of pages freed so far during a call to shrink_zones() */
unsigned long nr_reclaimed;
+
+ /*
+ * Reclaim pages from a vma. If a page is shared by other tasks
+ * it is zapped from this vma without reclaim, so it remains in
+ * memory until the last task zaps it.
+ */
+ struct vm_area_struct *target_vma;
};
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -146,6 +153,12 @@ int vm_swappiness = 60;
*/
unsigned long vm_total_pages;
+#ifdef CONFIG_KSWAPD_CPU_AFFINITY_MASK
+char *kswapd_cpu_mask = CONFIG_KSWAPD_CPU_AFFINITY_MASK;
+#else
+char *kswapd_cpu_mask = NULL;
+#endif
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
@@ -281,6 +294,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
long scanned = 0, next_deferred;
+ long min_cache_size = batch_size;
+
+ if (current_is_kswapd())
+ min_cache_size = 0;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0)
@@ -348,7 +365,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
- while (total_scan >= batch_size ||
+ while (total_scan > min_cache_size ||
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
@@ -385,6 +402,35 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
return freed;
}
+static void shrink_slab_lmk(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
+{
+ struct shrinker *shrinker;
+
+ if (nr_scanned == 0)
+ nr_scanned = SWAP_CLUSTER_MAX;
+
+ if (!down_read_trylock(&shrinker_rwsem))
+ goto out;
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ struct shrink_control sc = {
+ .gfp_mask = gfp_mask,
+ };
+
+ if (!(shrinker->flags & SHRINKER_LMK))
+ continue;
+
+ do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+ }
+
+ up_read(&shrinker_rwsem);
+out:
+ cond_resched();
+}
+
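+/*
+ * Note: shrinkers flagged SHRINKER_LMK are driven only by
+ * shrink_slab_lmk() above, from the global-reclaim call sites in
+ * shrink_zones() and kswapd; the regular shrink_slab() path below
+ * skips them.
+ */
+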
/**
* shrink_slab - shrink slab caches
* @gfp_mask: allocation context
@@ -446,6 +492,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
+ if (shrinker->flags & SHRINKER_LMK)
+ continue;
+
if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
@@ -915,7 +964,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
cond_resched();
@@ -927,7 +976,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON_PAGE(PageActive(page), page);
- VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+ if (zone)
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
@@ -1006,7 +1056,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
- test_bit(ZONE_WRITEBACK, &zone->flags)) {
+ (zone && test_bit(ZONE_WRITEBACK, &zone->flags))) {
nr_immediate++;
goto keep_locked;
@@ -1072,7 +1122,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page,
- ttu_flags|TTU_BATCH_FLUSH)) {
+ ttu_flags|TTU_BATCH_FLUSH,
+ sc->target_vma)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -1092,7 +1143,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !test_bit(ZONE_DIRTY, &zone->flags))) {
+ (zone &&
+ !test_bit(ZONE_DIRTY, &zone->flags)))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -1195,7 +1247,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
nr_reclaimed++;
@@ -1204,6 +1256,13 @@ free_it:
* appear not as the counts should be low
*/
list_add(&page->lru, &free_pages);
+ /*
+ * If the pagelist comes from multiple zones, we must decrease
+ * NR_ISOLATED_ANON + x on the freed pages here.
+ */
+ if (!zone)
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
continue;
cull_mlocked:
@@ -1215,7 +1274,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page) && vm_swap_full(page_swap_info(page)))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
@@ -1249,6 +1308,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
.may_unmap = 1,
+ /* Do not allow writing out dirty pages */
+ .may_writepage = 0,
};
unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
struct page *page, *next;
@@ -1256,7 +1317,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !isolated_balloon_page(page)) {
+ !__PageMovable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -1270,6 +1331,42 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
return ret;
}
+#ifdef CONFIG_PROCESS_RECLAIM
+unsigned long reclaim_pages_from_list(struct list_head *page_list,
+ struct vm_area_struct *vma)
+{
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .target_vma = vma,
+ };
+
+ unsigned long nr_reclaimed;
+ struct page *page;
+ unsigned long dummy1, dummy2, dummy3, dummy4, dummy5;
+
+ list_for_each_entry(page, page_list, lru)
+ ClearPageActive(page);
+
+ nr_reclaimed = shrink_page_list(page_list, NULL, &sc,
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
+ return nr_reclaimed;
+}
+#endif
+
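+/*
+ * Illustrative caller flow (sketch): a process-reclaim walker isolates
+ * pages from one task's VMA onto a private list and then calls
+ * reclaim_pages_from_list(&page_list, vma) above. Pages shared with
+ * other tasks are only unmapped from @vma (see scan_control.target_vma)
+ * and stay resident until the last mapping is gone.
+ */
+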
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
@@ -1466,6 +1563,44 @@ int isolate_lru_page(struct page *page)
return ret;
}
+static int __too_many_isolated(struct zone *zone, int file,
+ struct scan_control *sc, int safe)
+{
+ unsigned long inactive, isolated;
+
+ if (file) {
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_FILE);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_FILE);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+ isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+ }
+ } else {
+ if (safe) {
+ inactive = zone_page_state_snapshot(zone,
+ NR_INACTIVE_ANON);
+ isolated = zone_page_state_snapshot(zone,
+ NR_ISOLATED_ANON);
+ } else {
+ inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+ isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ }
+ }
+
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+ * won't get blocked by normal direct-reclaimers, forming a circular
+ * deadlock.
+ */
+ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
+ inactive >>= 3;
+
+ return isolated > inactive;
+}
+
/*
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
* then get rescheduled. When there is a massive number of tasks doing page
@@ -1474,33 +1609,22 @@ int isolate_lru_page(struct page *page)
* unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
- struct scan_control *sc)
+ struct scan_control *sc, int safe)
{
- unsigned long inactive, isolated;
-
if (current_is_kswapd())
return 0;
if (!sane_reclaim(sc))
return 0;
- if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
- } else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ if (unlikely(__too_many_isolated(zone, file, sc, 0))) {
+ if (safe)
+ return __too_many_isolated(zone, file, sc, safe);
+ else
+ return 1;
}
- /*
- * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
- * won't get blocked by normal direct-reclaimers, forming a circular
- * deadlock.
- */
- if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
- inactive >>= 3;
-
- return isolated > inactive;
+ return 0;
}
static noinline_for_stack void
@@ -1516,6 +1640,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
while (!list_empty(page_list)) {
struct page *page = lru_to_page(page_list);
int lru;
+ int file;
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
@@ -1532,8 +1657,11 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
lru = page_lru(page);
add_page_to_lru_list(page, lruvec, lru);
+ file = is_file_lru(lru);
+ if (IS_ENABLED(CONFIG_ZCACHE))
+ if (file)
+ SetPageWasActive(page);
if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
int numpages = hpage_nr_pages(page);
reclaim_stat->recent_rotated[file] += numpages;
}
@@ -1590,15 +1718,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_immediate = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
+ int safe = 0;
struct zone *zone = lruvec_zone(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- while (unlikely(too_many_isolated(zone, file, sc))) {
+ while (unlikely(too_many_isolated(zone, file, sc, safe))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
+
+ safe = 1;
}
lru_add_drain();
@@ -1855,6 +1986,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
ClearPageActive(page); /* we are de-activating */
+ if (IS_ENABLED(CONFIG_ZCACHE))
+ /*
+ * Let zcache know that this page came from the
+ * active file list.
+ */
+ SetPageWasActive(page);
list_add(&page->lru, &l_inactive);
}
@@ -2075,8 +2212,9 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
+ !inactive_file_is_low(lruvec) &&
+ get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2449,15 +2587,23 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
sc->nr_scanned - nr_scanned,
zone_lru_pages);
+ /*
+ * Record the subtree's reclaim efficiency. The reclaimed
+ * pages from slab is excluded here because the corresponding
+ * scanned pages is not accounted. Moreover, freeing a page
+ * by slab shrinking depends on each slab's object population,
+ * making the cost model (i.e. scan:free) different from that
+ * of LRU.
+ */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
+
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
-
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
@@ -2531,6 +2677,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
gfp_t orig_mask;
enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
bool reclaimable = false;
+ unsigned long lru_pages = 0;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@@ -2558,6 +2705,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* to global LRU.
*/
if (global_reclaim(sc)) {
+ lru_pages += zone_reclaimable_pages(zone);
if (!cpuset_zone_allowed(zone,
GFP_KERNEL | __GFP_HARDWALL))
continue;
@@ -2608,6 +2756,9 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
reclaimable = true;
}
+ if (global_reclaim(sc))
+ shrink_slab_lmk(sc->gfp_mask, 0, NULL,
+ sc->nr_scanned, lru_pages);
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
@@ -2966,18 +3117,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3027,7 +3183,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3082,9 +3238,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
- unsigned long *nr_attempted)
+ unsigned long lru_pages)
{
- int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
@@ -3092,17 +3247,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3116,14 +3260,13 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
+ shrink_slab_lmk(sc->gfp_mask, zone_to_nid(zone), NULL,
+ sc->nr_scanned, lru_pages);
clear_bit(ZONE_WRITEBACK, &zone->flags);
@@ -3134,7 +3277,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3146,7 +3289,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3163,8 +3306,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3181,9 +3323,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
+ unsigned long lru_pages = 0;
sc.nr_reclaimed = 0;
@@ -3218,7 +3359,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, false, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3234,32 +3375,23 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
+ /*
+ * If we're getting trouble reclaiming, start doing writepage
+ * even in laptop mode.
+ */
+ if (sc.priority < DEF_PRIORITY - 2)
+ sc.may_writepage = 1;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
+ lru_pages += zone_reclaimable_pages(zone);
}
/*
- * If we're getting trouble reclaiming, start doing writepage
- * even in laptop mode.
- */
- if (sc.priority < DEF_PRIORITY - 2)
- sc.may_writepage = 1;
-
- /*
* Now scan the zone in the dma->highmem direction, stopping
* at the last zone which needs scanning.
*
@@ -3295,8 +3427,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc, lru_pages))
raise_priority = false;
}
@@ -3309,49 +3440,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3362,7 +3473,22 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that pages and compaction may succeed so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3372,7 +3498,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3385,14 +3512,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- /*
- * Compaction records what page blocks it recently failed to
- * isolate pages from and skips them in the future scanning.
- * When kswapd is going to sleep, it is reasonable to assume
- * that pages and compaction may succeed so reset the cache.
- */
- reset_isolation_suitable(pgdat);
-
if (!kthread_should_stop())
schedule();
@@ -3422,7 +3541,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3435,7 +3553,7 @@ static int kswapd(void *p)
lockdep_set_current_reclaim_state(GFP_KERNEL);
- if (!cpumask_empty(cpumask))
+ if (kswapd_cpu_mask == NULL && !cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
current->reclaim_state = &reclaim_state;
@@ -3455,24 +3573,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3482,7 +3595,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3502,9 +3615,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3534,7 +3646,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3605,6 +3717,22 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
return NOTIFY_OK;
}
+static int set_kswapd_cpu_mask(pg_data_t *pgdat)
+{
+ int ret = 0;
+ cpumask_t tmask;
+
+ if (!kswapd_cpu_mask)
+ return 0;
+
+ cpumask_clear(&tmask);
+ ret = cpumask_parse(kswapd_cpu_mask, &tmask);
+ if (ret)
+ return ret;
+
+ return set_cpus_allowed_ptr(pgdat->kswapd, &tmask);
+}
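+
+/*
+ * Note: cpumask_parse() takes a hex bitmap string, so, for example, a
+ * configured mask of "f" pins kswapd to CPUs 0-3 and "3" to CPUs 0-1.
+ */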
+
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
@@ -3624,6 +3752,9 @@ int kswapd_run(int nid)
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
+ } else if (kswapd_cpu_mask) {
+ if (set_kswapd_cpu_mask(pgdat))
+ pr_warn("error setting kswapd cpu affinity mask\n");
}
return ret;
}
@@ -3649,7 +3780,8 @@ static int __init kswapd_init(void)
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
- hotcpu_notifier(cpu_callback, 0);
+ if (kswapd_cpu_mask == NULL)
+ hotcpu_notifier(cpu_callback, 0);
return 0;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9d8936c7b40d..3a07628297da 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -764,6 +764,8 @@ const char * const vmstat_text[] = {
"workingset_nodereclaim",
"nr_anon_transparent_hugepages",
"nr_free_cma",
+ "nr_swapcache",
+ "nr_indirectly_reclaimable",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -773,6 +775,7 @@ const char * const vmstat_text[] = {
/* enum vm_event_item counters */
"pgpgin",
"pgpgout",
+ "pgpgoutclean",
"pswpin",
"pswpout",
@@ -826,6 +829,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_daemon_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -901,6 +905,7 @@ static void frag_stop(struct seq_file *m, void *arg)
/* Walk all the zones in a node and print using a callback */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool nolock,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -911,27 +916,16 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
if (!populated_zone(zone))
continue;
- spin_lock_irqsave(&zone->lock, flags);
+ if (!nolock)
+ spin_lock_irqsave(&zone->lock, flags);
print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!nolock)
+ spin_unlock_irqrestore(&zone->lock, flags);
}
}
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Movable",
- "Reclaimable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -949,7 +943,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, frag_show_print);
+ walk_zones_in_node(m, pgdat, false, frag_show_print);
return 0;
}
@@ -990,7 +984,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, false, pagetypeinfo_showfree_print);
return 0;
}
@@ -1039,7 +1033,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, false, pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1083,7 +1077,11 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- pfn += (1UL << page_order(page)) - 1;
+ unsigned long freepage_order;
+
+ freepage_order = page_order_unsafe(page);
+ if (freepage_order < MAX_ORDER)
+ pfn += (1UL << freepage_order) - 1;
continue;
}
@@ -1130,7 +1128,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
@@ -1140,7 +1138,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1273,7 +1271,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
return 0;
}
@@ -1390,7 +1388,7 @@ static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- if (refresh_cpu_vm_stats(true)) {
+ if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
@@ -1407,7 +1405,8 @@ static void vmstat_update(struct work_struct *w)
} else {
/*
* We did not update any counters so the app may be in
- * a mode where it does not cause counter updates.
+ * a mode where it does not cause counter updates or the cpu
+ * was isolated.
* We may be uselessly running vmstat_update.
* Defer the checking for differentials to the
* shepherd thread on a different processor.
@@ -1488,7 +1487,7 @@ static void vmstat_shepherd(struct work_struct *w)
for_each_cpu(cpu, cpu_stat_off) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
- if (need_update(cpu)) {
+ if (!cpu_isolated(cpu) && need_update(cpu)) {
if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
} else {
@@ -1657,7 +1656,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, unusable_show_print);
+ walk_zones_in_node(m, pgdat, false, unusable_show_print);
return 0;
}
@@ -1709,7 +1708,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, false, extfrag_show_print);
return 0;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index d8a181fd779b..09ab957e2b10 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -357,13 +357,15 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
struct zbud_header *zhdr = NULL;
enum buddy bud;
struct page *page;
+ unsigned long flags;
+ int found = 0;
if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
return -ENOSPC;
chunks = size_to_chunks(size);
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
/* First, try to find an unbuddied zbud page. */
zhdr = NULL;
@@ -376,16 +378,17 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
bud = FIRST;
else
bud = LAST;
+ found = 1;
goto found;
}
}
/* Couldn't find unbuddied zbud page, create new one */
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
page = alloc_page(gfp);
if (!page)
return -ENOMEM;
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
pool->pages_nr++;
zhdr = init_zbud_page(page);
bud = FIRST;
@@ -411,7 +414,9 @@ found:
list_add(&zhdr->lru, &pool->lru);
*handle = encode_handle(zhdr, bud);
- spin_unlock(&pool->lock);
+ if ((gfp & __GFP_ZERO) && found)
+ memset((void *)*handle, 0, size);
+ spin_unlock_irqrestore(&pool->lock, flags);
return 0;
}
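+
+/*
+ * Note: a page fresh from alloc_page(gfp) already honors __GFP_ZERO,
+ * so the explicit memset() in zbud_alloc() above is only needed when
+ * an existing unbuddied slot is reused (found == 1).
+ */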
@@ -430,8 +435,9 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
struct zbud_header *zhdr;
int freechunks;
+ unsigned long flags;
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
zhdr = handle_to_zbud_header(handle);
/* If first buddy, handle will be page aligned */
@@ -442,7 +448,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
if (zhdr->under_reclaim) {
/* zbud page is under reclaim, reclaim will free */
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
return;
}
@@ -460,7 +466,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
}
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
}
#define list_tail_entry(ptr, type, member) \
@@ -505,12 +511,13 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
int i, ret, freechunks;
struct zbud_header *zhdr;
+ unsigned long flags;
unsigned long first_handle = 0, last_handle = 0;
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
retries == 0) {
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
return -EINVAL;
}
for (i = 0; i < retries; i++) {
@@ -529,7 +536,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
first_handle = encode_handle(zhdr, FIRST);
if (zhdr->last_chunks)
last_handle = encode_handle(zhdr, LAST);
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
/* Issue the eviction callback(s) */
if (first_handle) {
@@ -543,7 +550,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
goto next;
}
next:
- spin_lock(&pool->lock);
+ spin_lock_irqsave(&pool->lock, flags);
zhdr->under_reclaim = false;
if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
/*
@@ -552,7 +559,7 @@ next:
*/
free_zbud_page(zhdr);
pool->pages_nr--;
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
return 0;
} else if (zhdr->first_chunks == 0 ||
zhdr->last_chunks == 0) {
@@ -567,7 +574,7 @@ next:
/* add to beginning of LRU */
list_add(&zhdr->lru, &pool->lru);
}
- spin_unlock(&pool->lock);
+ spin_unlock_irqrestore(&pool->lock, flags);
return -EAGAIN;
}
diff --git a/mm/zcache.c b/mm/zcache.c
new file mode 100644
index 000000000000..01473566ed0b
--- /dev/null
+++ b/mm/zcache.c
@@ -0,0 +1,1169 @@
+/*
+ * linux/mm/zcache.c
+ *
+ * A cleancache backend for file pages compression.
+ * Concepts based on original zcache by Dan Magenheimer.
+ * Copyright (C) 2013 Bob Liu <bob.liu@xxxxxxxxxx>
+ *
+ * With zcache, active file pages can be compressed in memory during page
+ * reclaiming. When their data is needed again, the read I/O is avoided.
+ * This results in a significant performance gain under memory pressure
+ * for systems with many file pages.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/cleancache.h>
+#include <linux/cpu.h>
+#include <linux/crypto.h>
+#include <linux/page-flags.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/types.h>
+#include <linux/zbud.h>
+
+/*
+ * Enable/disable zcache (disabled by default)
+ */
+static bool zcache_enabled __read_mostly;
+module_param_named(enabled, zcache_enabled, bool, 0);
+
+/*
+ * Compressor to be used by zcache
+ */
+#define ZCACHE_COMPRESSOR_DEFAULT "lzo"
+static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
+module_param_named(compressor, zcache_compressor, charp, 0);
+
+/*
+ * The maximum percentage of memory that the compressed pool can occupy.
+ */
+static unsigned int zcache_max_pool_percent = 10;
+module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644);
+
+static unsigned int zcache_clear_percent = 4;
+module_param_named(clear_percent, zcache_clear_percent, uint, 0644);
+/*
+ * zcache statistics
+ */
+static u64 zcache_pool_limit_hit;
+static u64 zcache_dup_entry;
+static u64 zcache_zbud_alloc_fail;
+static u64 zcache_evict_zpages;
+static u64 zcache_evict_filepages;
+static u64 zcache_inactive_pages_refused;
+static u64 zcache_reclaim_fail;
+static u64 zcache_pool_shrink;
+static u64 zcache_pool_shrink_fail;
+static u64 zcache_pool_shrink_pages;
+static u64 zcache_store_failed;
+static atomic_t zcache_stored_pages = ATOMIC_INIT(0);
+static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0);
+
+#define GFP_ZCACHE \
+ (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \
+ __GFP_NOMEMALLOC | __GFP_ZERO)
+
+/*
+ * Make sure this is different from radix tree
+ * indirect ptr or exceptional entry.
+ */
+#define ZERO_HANDLE ((void *)~(~0UL >> 1))
+
+/*
+ * Zcache receives pages for compression through the Cleancache API and is able
+ * to evict pages from its own compressed pool on an LRU basis in the case that
+ * the compressed pool is full.
+ *
+ * Zcache makes use of zbud for managing the compressed memory pool. Each
+ * allocation in zbud is not directly accessible by address. Rather, a handle
+ * (zaddr) is returned by the allocation routine, and that handle (zaddr) must
+ * be mapped before being accessed. The compressed memory pool grows on demand and
+ * shrinks as compressed pages are freed.
+ *
+ * When a file page is passed from cleancache to zcache, zcache maintains a
+ * mapping of the <filesystem_type, inode_number, page_index> to the zbud
+ * address that references that compressed file page. This mapping is achieved
+ * with a red-black tree per filesystem type, plus a radix tree per red-black
+ * node.
+ *
+ * A zcache pool, with pool_id as the index, is created when a filesystem is
+ * mounted. Each zcache pool has a red-black tree, where the inode number
+ * (rb_index) is the search key. Each red-black tree node has a radix tree
+ * which uses page->index (ra_index) as the index. Each radix tree slot points
+ * to the zbud address combined with some extra information (zcache_ra_handle).
+ */
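+
+/*
+ * Illustrative lookup path (pseudocode sketch):
+ *
+ *    zpool  = zcache.pools[pool_id];
+ *    rbnode = lookup of rb_index (inode number) in zpool->rbtree;
+ *    zaddr  = radix_tree_lookup(&rbnode->ratree, ra_index);
+ *
+ * The helpers below (zcache_find_get_rbnode() and friends) wrap this
+ * flow with locking and reference counting.
+ */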
+#define MAX_ZCACHE_POOLS 32
+/*
+ * One zcache_pool per (cleancache aware) filesystem mount instance
+ */
+struct zcache_pool {
+ struct rb_root rbtree;
+ rwlock_t rb_lock; /* Protects rbtree */
+ u64 size;
+ struct zbud_pool *pool; /* Zbud pool used */
+};
+
+/*
+ * Manage all zcache pools
+ */
+struct _zcache {
+ struct zcache_pool *pools[MAX_ZCACHE_POOLS];
+ u32 num_pools; /* Current no. of zcache pools */
+ spinlock_t pool_lock; /* Protects pools[] and num_pools */
+};
+struct _zcache zcache;
+
+/*
+ * Redblack tree node, each node has a page index radix-tree.
+ * Indexed by inode number.
+ */
+struct zcache_rbnode {
+ struct rb_node rb_node;
+ int rb_index;
+ struct radix_tree_root ratree; /* Page radix tree per inode rbtree */
+ spinlock_t ra_lock; /* Protects radix tree */
+ struct kref refcount;
+};
+
+/*
+ * Radix-tree leaf, indexed by page->index
+ */
+struct zcache_ra_handle {
+ int rb_index; /* Redblack tree index */
+ int ra_index; /* Radix tree index */
+ int zlen; /* Compressed page size */
+ struct zcache_pool *zpool; /* Finding zcache_pool during evict */
+};
+
+u64 zcache_pages(void)
+{
+ int i;
+ u64 count = 0;
+
+ for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
+ count += zcache.pools[i]->size;
+
+ return count;
+}
+
+static struct kmem_cache *zcache_rbnode_cache;
+static int zcache_rbnode_cache_create(void)
+{
+ zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0);
+ return zcache_rbnode_cache == NULL;
+}
+static void zcache_rbnode_cache_destroy(void)
+{
+ kmem_cache_destroy(zcache_rbnode_cache);
+}
+
+static unsigned long zcache_count(struct shrinker *s,
+ struct shrink_control *sc)
+{
+ unsigned long active_file;
+ long file_gap;
+
+ active_file = global_page_state(NR_ACTIVE_FILE);
+ file_gap = zcache_pages() - active_file;
+ if (file_gap < 0)
+ file_gap = 0;
+ return file_gap;
+}
+
+static unsigned long zcache_scan(struct shrinker *s, struct shrink_control *sc)
+{
+ unsigned long active_file;
+ unsigned long file;
+ long file_gap;
+ unsigned long freed = 0;
+ unsigned long pool;
+ static bool running;
+ int i = 0;
+ int retries;
+
+ if (running)
+ goto end;
+
+ running = true;
+ active_file = global_page_state(NR_ACTIVE_FILE);
+ file = global_page_state(NR_FILE_PAGES);
+ pool = zcache_pages();
+
+ file_gap = pool - file;
+
+ if ((file_gap >= 0) &&
+ (totalram_pages * zcache_clear_percent / 100 > file)) {
+ file_gap = pool;
+ zcache_pool_shrink++;
+ goto reclaim;
+ }
+
+ /*
+ * file_gap == 0 means the zbud pool holds about as many raw
+ * pages as there are active file pages; since each zbud page
+ * can hold up to two compressed pages, zcache may then be
+ * storing around twice as many pages as the active file list.
+ */
+ file_gap = pool - active_file;
+ if (file_gap < 0)
+ file_gap = 0;
+ else
+ zcache_pool_shrink++;
+
+reclaim:
+ retries = file_gap;
+ while ((file_gap > 0) && retries) {
+ struct zcache_pool *zpool =
+ zcache.pools[i++ % MAX_ZCACHE_POOLS];
+ if (!zpool || !zpool->size)
+ continue;
+ if (zbud_reclaim_page(zpool->pool, 8)) {
+ zcache_pool_shrink_fail++;
+ retries--;
+ continue;
+ }
+ freed++;
+ file_gap--;
+ }
+
+ zcache_pool_shrink_pages += freed;
+ for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
+ zcache.pools[i]->size =
+ zbud_get_pool_size(zcache.pools[i]->pool);
+
+ running = false;
+end:
+ return freed;
+}
+
+static struct shrinker zcache_shrinker = {
+ .scan_objects = zcache_scan,
+ .count_objects = zcache_count,
+ .seeks = DEFAULT_SEEKS * 16
+};
+
+/*
+ * Compression functions
+ * (The functions below are copied from zswap!)
+ */
+static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
+
+enum comp_op {
+ ZCACHE_COMPOP_COMPRESS,
+ ZCACHE_COMPOP_DECOMPRESS
+};
+
+static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
+ u8 *dst, unsigned int *dlen)
+{
+ struct crypto_comp *tfm;
+ int ret;
+
+ tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
+ switch (op) {
+ case ZCACHE_COMPOP_COMPRESS:
+ ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
+ break;
+ case ZCACHE_COMPOP_DECOMPRESS:
+ ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ put_cpu();
+ return ret;
+}
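+
+/*
+ * Illustrative use (sketch, hypothetical locals; error handling
+ * omitted): compressing one file page into the per-cpu destination
+ * buffer (zcache_dstmem, declared below):
+ *
+ *    u8 *src = kmap_atomic(page);
+ *    u8 *dst = get_cpu_var(zcache_dstmem);
+ *    unsigned int dlen = PAGE_SIZE * 2;
+ *    int ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src,
+ *                             PAGE_SIZE, dst, &dlen);
+ *    put_cpu_var(zcache_dstmem);
+ *    kunmap_atomic(src);
+ */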
+
+static int __init zcache_comp_init(void)
+{
+ if (!crypto_has_comp(zcache_compressor, 0, 0)) {
+ pr_info("%s compressor not available\n", zcache_compressor);
+ /* fall back to default compressor */
+ zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
+ if (!crypto_has_comp(zcache_compressor, 0, 0))
+ /* can't even load the default compressor */
+ return -ENODEV;
+ }
+ pr_info("using %s compressor\n", zcache_compressor);
+
+ /* alloc percpu transforms */
+ zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
+ if (!zcache_comp_pcpu_tfms)
+ return -ENOMEM;
+ return 0;
+}
+
+static void zcache_comp_exit(void)
+{
+ /* free percpu transforms */
+ if (zcache_comp_pcpu_tfms)
+ free_percpu(zcache_comp_pcpu_tfms);
+}
+
+/*
+ * Per-cpu code
+ * (The functions below are also copied from zswap!)
+ */
+static DEFINE_PER_CPU(u8 *, zcache_dstmem);
+
+static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu)
+{
+ struct crypto_comp *tfm;
+ u8 *dst;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ tfm = crypto_alloc_comp(zcache_compressor, 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_err("can't allocate compressor transform\n");
+ return NOTIFY_BAD;
+ }
+ *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
+ dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
+ if (!dst) {
+ pr_err("can't allocate compressor buffer\n");
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
+ return NOTIFY_BAD;
+ }
+ per_cpu(zcache_dstmem, cpu) = dst;
+ break;
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
+ if (tfm) {
+ crypto_free_comp(tfm);
+ *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
+ }
+ dst = per_cpu(zcache_dstmem, cpu);
+ kfree(dst);
+ per_cpu(zcache_dstmem, cpu) = NULL;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static int zcache_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *pcpu)
+{
+ unsigned long cpu = (unsigned long)pcpu;
+
+ return __zcache_cpu_notifier(action, cpu);
+}
+
+static struct notifier_block zcache_cpu_notifier_block = {
+ .notifier_call = zcache_cpu_notifier
+};
+
+static int zcache_cpu_init(void)
+{
+ unsigned long cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu)
+ if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+ goto cleanup;
+ register_cpu_notifier(&zcache_cpu_notifier_block);
+ put_online_cpus();
+ return 0;
+
+cleanup:
+ for_each_online_cpu(cpu)
+ __zcache_cpu_notifier(CPU_UP_CANCELED, cpu);
+ put_online_cpus();
+ return -ENOMEM;
+}
+
+/*
+ * Zcache helpers
+ */
+static bool zcache_is_full(void)
+{
+ long file = global_page_state(NR_FILE_PAGES);
+
+ return ((totalram_pages * zcache_max_pool_percent / 100 <
+ zcache_pages()) ||
+ (totalram_pages * zcache_clear_percent / 100 >
+ file));
+}
+
+/*
+ * The caller must hold zpool->rb_lock at least
+ */
+static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree,
+ int index, struct rb_node **rb_parent, struct rb_node ***rb_link)
+{
+ struct zcache_rbnode *entry;
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+
+ __rb_link = &rbtree->rb_node;
+ rb_prev = __rb_parent = NULL;
+
+ while (*__rb_link) {
+ __rb_parent = *__rb_link;
+ entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node);
+ if (entry->rb_index > index)
+ __rb_link = &__rb_parent->rb_left;
+ else if (entry->rb_index < index) {
+ rb_prev = __rb_parent;
+ __rb_link = &__rb_parent->rb_right;
+ } else
+ return entry;
+ }
+
+ if (rb_parent)
+ *rb_parent = __rb_parent;
+ if (rb_link)
+ *rb_link = __rb_link;
+ return NULL;
+}
+
+static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool,
+ int rb_index)
+{
+ unsigned long flags;
+ struct zcache_rbnode *rbnode;
+
+ read_lock_irqsave(&zpool->rb_lock, flags);
+ rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, NULL, NULL);
+ if (rbnode)
+ kref_get(&rbnode->refcount);
+ read_unlock_irqrestore(&zpool->rb_lock, flags);
+ return rbnode;
+}
+
+/*
+ * kref_put callback for zcache_rbnode.
+ *
+ * The rbnode must have been isolated from rbtree already.
+ */
+static void zcache_rbnode_release(struct kref *kref)
+{
+ struct zcache_rbnode *rbnode;
+
+ rbnode = container_of(kref, struct zcache_rbnode, refcount);
+ BUG_ON(rbnode->ratree.rnode);
+ kmem_cache_free(zcache_rbnode_cache, rbnode);
+}
+
+/*
+ * Check whether the radix-tree of this rbnode is empty.
+ * If that's true, then we can delete this zcache_rbnode from
+ * zcache_pool->rbtree
+ *
+ * Caller must hold zcache_rbnode->ra_lock
+ */
+static int zcache_rbnode_empty(struct zcache_rbnode *rbnode)
+{
+ return rbnode->ratree.rnode == NULL;
+}
+
+/*
+ * Remove zcache_rbnode from zpool->rbtree
+ *
+ * holded_rblock - whether the caller already holds zpool->rb_lock
+ */
+static void zcache_rbnode_isolate(struct zcache_pool *zpool,
+ struct zcache_rbnode *rbnode, bool holded_rblock)
+{
+ unsigned long flags;
+
+ if (!holded_rblock)
+ write_lock_irqsave(&zpool->rb_lock, flags);
+ /*
+ * Someone can get a reference on this rbnode before we could
+ * acquire the write lock above.
+ * We want to remove it from zpool->rbtree only when the caller and
+ * the corresponding ratree hold a reference to this rbnode.
+ * The check below ensures that a racing zcache put will not end up adding
+ * a page to an isolated node and thereby losing that memory.
+ */
+ if (atomic_read(&rbnode->refcount.refcount) == 2) {
+ rb_erase(&rbnode->rb_node, &zpool->rbtree);
+ RB_CLEAR_NODE(&rbnode->rb_node);
+ kref_put(&rbnode->refcount, zcache_rbnode_release);
+ }
+ if (!holded_rblock)
+ write_unlock_irqrestore(&zpool->rb_lock, flags);
+}
+
+/*
+ * Store a zaddr allocated by zbud_alloc() into the rbtree/radix-tree
+ * hierarchy.
+ */
+static int zcache_store_zaddr(struct zcache_pool *zpool,
+ int ra_index, int rb_index, unsigned long zaddr)
+{
+ unsigned long flags;
+ struct zcache_rbnode *rbnode, *tmp;
+ struct rb_node **link = NULL, *parent = NULL;
+ int ret;
+ void *dup_zaddr;
+
+ rbnode = zcache_find_get_rbnode(zpool, rb_index);
+ if (!rbnode) {
+ /* alloc and init a new rbnode */
+ rbnode = kmem_cache_alloc(zcache_rbnode_cache,
+ GFP_ZCACHE);
+ if (!rbnode)
+ return -ENOMEM;
+
+ INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN);
+ spin_lock_init(&rbnode->ra_lock);
+ rbnode->rb_index = rb_index;
+ kref_init(&rbnode->refcount);
+ RB_CLEAR_NODE(&rbnode->rb_node);
+
+ /* add that rbnode to rbtree */
+ write_lock_irqsave(&zpool->rb_lock, flags);
+ tmp = zcache_find_rbnode(&zpool->rbtree, rb_index,
+ &parent, &link);
+ if (tmp) {
+ /* somebody else allocated new rbnode */
+ kmem_cache_free(zcache_rbnode_cache, rbnode);
+ rbnode = tmp;
+ } else {
+ rb_link_node(&rbnode->rb_node, parent, link);
+ rb_insert_color(&rbnode->rb_node, &zpool->rbtree);
+ }
+
+ /* Inc the reference of this zcache_rbnode */
+ kref_get(&rbnode->refcount);
+ write_unlock_irqrestore(&zpool->rb_lock, flags);
+ }
+
+ /* Successfully got a zcache_rbnode when arriving here */
+ spin_lock_irqsave(&rbnode->ra_lock, flags);
+ dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+ if (unlikely(dup_zaddr)) {
+ WARN_ON("duplicated, will be replaced!\n");
+ if (dup_zaddr == ZERO_HANDLE) {
+ atomic_dec(&zcache_stored_zero_pages);
+ } else {
+ zbud_free(zpool->pool, (unsigned long)dup_zaddr);
+ atomic_dec(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(zpool->pool);
+ }
+ zcache_dup_entry++;
+ }
+
+ /* Insert zcache_ra_handle to ratree */
+ ret = radix_tree_insert(&rbnode->ratree, ra_index,
+ (void *)zaddr);
+ spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+ if (unlikely(ret)) {
+ write_lock_irqsave(&zpool->rb_lock, flags);
+ spin_lock(&rbnode->ra_lock);
+
+ if (zcache_rbnode_empty(rbnode))
+ zcache_rbnode_isolate(zpool, rbnode, 1);
+
+ spin_unlock(&rbnode->ra_lock);
+ write_unlock_irqrestore(&zpool->rb_lock, flags);
+ }
+
+ kref_put(&rbnode->refcount, zcache_rbnode_release);
+ return ret;
+}
+
+/*
+ * Load zaddr and delete it from radix tree.
+ * If the radix tree of the corresponding rbnode is empty, delete the rbnode
+ * from zpool->rbtree also.
+ */
+static void *zcache_load_delete_zaddr(struct zcache_pool *zpool,
+ int rb_index, int ra_index)
+{
+ struct zcache_rbnode *rbnode;
+ void *zaddr = NULL;
+ unsigned long flags;
+
+ rbnode = zcache_find_get_rbnode(zpool, rb_index);
+ if (!rbnode)
+ goto out;
+
+ BUG_ON(rbnode->rb_index != rb_index);
+
+ spin_lock_irqsave(&rbnode->ra_lock, flags);
+ zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+ spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+
+ /* rb_lock and ra_lock must be taken again in the given sequence */
+ write_lock_irqsave(&zpool->rb_lock, flags);
+ spin_lock(&rbnode->ra_lock);
+ if (zcache_rbnode_empty(rbnode))
+ zcache_rbnode_isolate(zpool, rbnode, 1);
+ spin_unlock(&rbnode->ra_lock);
+ write_unlock_irqrestore(&zpool->rb_lock, flags);
+
+ kref_put(&rbnode->refcount, zcache_rbnode_release);
+out:
+ return zaddr;
+}
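+
+/*
+ * zero_page() below open-codes the scan; an equivalent check (a sketch
+ * using the kernel's memchr_inv() helper) would be:
+ *
+ * void *ptr = kmap_atomic(page);
+ * bool zero = !memchr_inv(ptr, 0, PAGE_SIZE);
+ * kunmap_atomic(ptr);
+ */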
+
+static bool zero_page(struct page *page)
+{
+ unsigned long *ptr = kmap_atomic(page);
+ int i;
+ bool ret = false;
+
+ for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) {
+ if (ptr[i])
+ goto out;
+ }
+ ret = true;
+out:
+ kunmap_atomic(ptr);
+ return ret;
+}
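+
+/*
+ * Store one clean file page into the compressed pool. The page is keyed
+ * by (key.u.ino, index): the inode number selects the rbnode in
+ * zpool->rbtree and the page offset selects the slot in that rbnode's
+ * radix tree. A non-zero page is stored in zbud as
+ * [struct zcache_ra_handle][compressed data]; an all-zero page is
+ * recorded as the special ZERO_HANDLE and occupies no pool memory.
+ */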
+
+static void zcache_store_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index, struct page *page)
+{
+ struct zcache_ra_handle *zhandle;
+ u8 *zpage, *src, *dst;
+ /* Address of zhandle + compressed data (zpage) */
+ unsigned long zaddr = 0;
+ unsigned int zlen = PAGE_SIZE;
+ bool zero = false;
+ int ret;
+
+ struct zcache_pool *zpool = zcache.pools[pool_id];
+
+ /*
+ * Zcache would be ineffective if the compressed memory pool were
+ * filled with compressed inactive file pages, most of which will
+ * never be used again.
+ * So we refuse to compress pages that are not on the active file
+ * list.
+ */
+ if (!PageWasActive(page)) {
+ zcache_inactive_pages_refused++;
+ return;
+ }
+
+ zero = zero_page(page);
+ if (zero)
+ goto zero;
+
+ if (zcache_is_full()) {
+ zcache_pool_limit_hit++;
+ if (zbud_reclaim_page(zpool->pool, 8)) {
+ zcache_reclaim_fail++;
+ return;
+ }
+ /*
+ * Continue only if a page frame was reclaimed successfully.
+ */
+ zcache_evict_filepages++;
+ zpool->size = zbud_get_pool_size(zpool->pool);
+ }
+
+ /* compress */
+ dst = get_cpu_var(zcache_dstmem);
+ src = kmap_atomic(page);
+ ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst,
+ &zlen);
+ kunmap_atomic(src);
+ if (ret) {
+ pr_err("zcache compress error ret %d\n", ret);
+ put_cpu_var(zcache_dstmem);
+ return;
+ }
+
+ /* store zcache handle together with compressed page data */
+ ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle),
+ GFP_ZCACHE, &zaddr);
+ if (ret) {
+ zcache_zbud_alloc_fail++;
+ put_cpu_var(zcache_dstmem);
+ return;
+ }
+
+ zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr);
+
+ /* Compressed page data stored at the end of zcache_ra_handle */
+ zpage = (u8 *)(zhandle + 1);
+ memcpy(zpage, dst, zlen);
+ zbud_unmap(zpool->pool, zaddr);
+ put_cpu_var(zcache_dstmem);
+
+zero:
+ if (zero)
+ zaddr = (unsigned long)ZERO_HANDLE;
+
+ /* store zcache handle */
+ ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr);
+ if (ret) {
+ zcache_store_failed++;
+ if (!zero)
+ zbud_free(zpool->pool, zaddr);
+ return;
+ }
+
+ /* update stats */
+ if (zero) {
+ atomic_inc(&zcache_stored_zero_pages);
+ } else {
+ zhandle->ra_index = index;
+ zhandle->rb_index = key.u.ino;
+ zhandle->zlen = zlen;
+ zhandle->zpool = zpool;
+ atomic_inc(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(zpool->pool);
+ }
+
+ return;
+}
+
+static int zcache_load_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index, struct page *page)
+{
+ int ret = 0;
+ u8 *src, *dst;
+ void *zaddr;
+ unsigned int dlen = PAGE_SIZE;
+ struct zcache_ra_handle *zhandle;
+ struct zcache_pool *zpool = zcache.pools[pool_id];
+
+ zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
+ if (!zaddr)
+ return -ENOENT;
+ else if (zaddr == ZERO_HANDLE)
+ goto map;
+
+ zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool,
+ (unsigned long)zaddr);
+ /* Compressed page data stored at the end of zcache_ra_handle */
+ src = (u8 *)(zhandle + 1);
+
+ /* decompress */
+map:
+ dst = kmap_atomic(page);
+ if (zaddr != ZERO_HANDLE) {
+ ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src,
+ zhandle->zlen, dst, &dlen);
+ } else {
+ memset(dst, 0, PAGE_SIZE);
+ kunmap_atomic(dst);
+ flush_dcache_page(page);
+ atomic_dec(&zcache_stored_zero_pages);
+ goto out;
+ }
+ kunmap_atomic(dst);
+ zbud_unmap(zpool->pool, (unsigned long)zaddr);
+ zbud_free(zpool->pool, (unsigned long)zaddr);
+
+ BUG_ON(ret);
+ BUG_ON(dlen != PAGE_SIZE);
+
+ /* update stats */
+ atomic_dec(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(zpool->pool);
+out:
+ SetPageWasActive(page);
+ return ret;
+}
+
+static void zcache_flush_page(int pool_id, struct cleancache_filekey key,
+ pgoff_t index)
+{
+ struct zcache_pool *zpool = zcache.pools[pool_id];
+ void *zaddr = NULL;
+
+ zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
+ if (zaddr && (zaddr != ZERO_HANDLE)) {
+ zbud_free(zpool->pool, (unsigned long)zaddr);
+ atomic_dec(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(zpool->pool);
+ } else if (zaddr == ZERO_HANDLE) {
+ atomic_dec(&zcache_stored_zero_pages);
+ }
+}
+
+#define FREE_BATCH 16
+/*
+ * Caller must hold both zpool->rb_lock (write) and rbnode->ra_lock.
+ */
+static void zcache_flush_ratree(struct zcache_pool *zpool,
+ struct zcache_rbnode *rbnode)
+{
+ unsigned long index = 0;
+ int count, i;
+ struct zcache_ra_handle *zhandle;
+ void *zaddr = NULL;
+
+ do {
+ void *zaddrs[FREE_BATCH];
+ unsigned long indices[FREE_BATCH];
+
+ count = radix_tree_gang_lookup_index(&rbnode->ratree,
+ (void **)zaddrs, indices,
+ index, FREE_BATCH);
+
+ for (i = 0; i < count; i++) {
+ if (zaddrs[i] == ZERO_HANDLE) {
+ zaddr = radix_tree_delete(&rbnode->ratree,
+ indices[i]);
+ if (zaddr)
+ atomic_dec(&zcache_stored_zero_pages);
+ /* keep the scan cursor moving forward */
+ index = indices[i];
+ continue;
+ }
+ zhandle = (struct zcache_ra_handle *)zbud_map(
+ zpool->pool, (unsigned long)zaddrs[i]);
+ index = zhandle->ra_index;
+ zaddr = radix_tree_delete(&rbnode->ratree, index);
+ if (!zaddr)
+ continue;
+ zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]);
+ zbud_free(zpool->pool, (unsigned long)zaddrs[i]);
+ atomic_dec(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(zpool->pool);
+ }
+
+ index++;
+ } while (count == FREE_BATCH);
+}
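+
+/*
+ * Note: radix_tree_gang_lookup_index() batches deletions in groups of
+ * FREE_BATCH; the loop above terminates once a lookup returns fewer
+ * than FREE_BATCH entries, i.e. the tree has been drained past the
+ * current cursor.
+ */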
+
+static void zcache_flush_inode(int pool_id, struct cleancache_filekey key)
+{
+ struct zcache_rbnode *rbnode;
+ unsigned long flags1, flags2;
+ struct zcache_pool *zpool = zcache.pools[pool_id];
+
+ /*
+ * Prevent new pages from being added to this rbnode, so take
+ * rb_lock first.
+ */
+ write_lock_irqsave(&zpool->rb_lock, flags1);
+ rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, NULL, NULL);
+ if (!rbnode) {
+ write_unlock_irqrestore(&zpool->rb_lock, flags1);
+ return;
+ }
+
+ kref_get(&rbnode->refcount);
+ spin_lock_irqsave(&rbnode->ra_lock, flags2);
+
+ zcache_flush_ratree(zpool, rbnode);
+ if (zcache_rbnode_empty(rbnode))
+ /* When we arrive here, we already hold rb_lock */
+ zcache_rbnode_isolate(zpool, rbnode, 1);
+
+ spin_unlock_irqrestore(&rbnode->ra_lock, flags2);
+ write_unlock_irqrestore(&zpool->rb_lock, flags1);
+ kref_put(&rbnode->refcount, zcache_rbnode_release);
+}
+
+static void zcache_destroy_pool(struct zcache_pool *zpool);
+static void zcache_flush_fs(int pool_id)
+{
+ struct zcache_rbnode *z_rbnode = NULL;
+ struct rb_node *rbnode;
+ unsigned long flags1, flags2;
+ struct zcache_pool *zpool;
+
+ if (pool_id < 0)
+ return;
+
+ zpool = zcache.pools[pool_id];
+ if (!zpool)
+ return;
+
+ /*
+ * Prevent new pages from being added, so take rb_lock first.
+ */
+ write_lock_irqsave(&zpool->rb_lock, flags1);
+
+ rbnode = rb_first(&zpool->rbtree);
+ while (rbnode) {
+ z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node);
+ rbnode = rb_next(rbnode);
+ if (z_rbnode) {
+ kref_get(&z_rbnode->refcount);
+ spin_lock_irqsave(&z_rbnode->ra_lock, flags2);
+ zcache_flush_ratree(zpool, z_rbnode);
+ if (zcache_rbnode_empty(z_rbnode))
+ zcache_rbnode_isolate(zpool, z_rbnode, 1);
+ spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2);
+ kref_put(&z_rbnode->refcount, zcache_rbnode_release);
+ }
+ }
+
+ write_unlock_irqrestore(&zpool->rb_lock, flags1);
+ zcache_destroy_pool(zpool);
+}
+
+/*
+ * Evict compressed pages from the zcache pool on an LRU basis once the
+ * compressed pool is full.
+ */
+static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr)
+{
+ struct zcache_pool *zpool;
+ struct zcache_ra_handle *zhandle;
+ void *zaddr_intree;
+
+ BUG_ON(zaddr == (unsigned long)ZERO_HANDLE);
+
+ zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr);
+
+ zpool = zhandle->zpool;
+ /* There can be a race with zcache store */
+ if (!zpool)
+ return -EINVAL;
+
+ BUG_ON(pool != zpool->pool);
+
+ zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index,
+ zhandle->ra_index);
+ if (zaddr_intree) {
+ BUG_ON((unsigned long)zaddr_intree != zaddr);
+ zbud_unmap(pool, zaddr);
+ zbud_free(pool, zaddr);
+ atomic_dec(&zcache_stored_pages);
+ zpool->size = zbud_get_pool_size(pool);
+ zcache_evict_zpages++;
+ }
+ return 0;
+}
+
+static struct zbud_ops zcache_zbud_ops = {
+ .evict = zcache_evict_zpage
+};
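+
+/*
+ * Eviction contract (a sketch of the zbud callback protocol as used
+ * here): when zbud_reclaim_page() picks an LRU zbud page, it invokes
+ * ->evict for each allocated handle in that page; a 0 return tells
+ * zbud the handle was released and the page frame may be freed.
+ */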
+
+/* Returns the new pool id on success, or a negative errno on failure. */
+static int zcache_create_pool(void)
+{
+ int ret;
+ struct zcache_pool *zpool;
+
+ zpool = kzalloc(sizeof(*zpool), GFP_KERNEL);
+ if (!zpool) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops);
+ if (!zpool->pool) {
+ kfree(zpool);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&zcache.pool_lock);
+ if (zcache.num_pools == MAX_ZCACHE_POOLS) {
+ pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS);
+ zbud_destroy_pool(zpool->pool);
+ kfree(zpool);
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ rwlock_init(&zpool->rb_lock);
+ zpool->rbtree = RB_ROOT;
+ /* Add to pool list */
+ for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++)
+ if (!zcache.pools[ret])
+ break;
+ zcache.pools[ret] = zpool;
+ zcache.num_pools++;
+ pr_info("New pool created id:%d\n", ret);
+
+out_unlock:
+ spin_unlock(&zcache.pool_lock);
+out:
+ return ret;
+}
+
+static void zcache_destroy_pool(struct zcache_pool *zpool)
+{
+ int i;
+
+ if (!zpool)
+ return;
+
+ spin_lock(&zcache.pool_lock);
+ zcache.num_pools--;
+ for (i = 0; i < MAX_ZCACHE_POOLS; i++)
+ if (zcache.pools[i] == zpool)
+ break;
+ zcache.pools[i] = NULL;
+ spin_unlock(&zcache.pool_lock);
+
+ if (!RB_EMPTY_ROOT(&zpool->rbtree))
+ WARN_ON("Memory leak detected. Freeing non-empty pool!\n");
+
+ zbud_destroy_pool(zpool->pool);
+ kfree(zpool);
+}
+
+static int zcache_init_fs(size_t pagesize)
+{
+ int ret;
+
+ if (pagesize != PAGE_SIZE) {
+ pr_info("Unsupported page size: %zu", pagesize);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = zcache_create_pool();
+ if (ret < 0) {
+ pr_info("Failed to create new pool\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static int zcache_init_shared_fs(char *uuid, size_t pagesize)
+{
+ /* shared pools are unsupported and map to private */
+ return zcache_init_fs(pagesize);
+}
+
+static struct cleancache_ops zcache_ops = {
+ .put_page = zcache_store_page,
+ .get_page = zcache_load_page,
+ .invalidate_page = zcache_flush_page,
+ .invalidate_inode = zcache_flush_inode,
+ .invalidate_fs = zcache_flush_fs,
+ .init_shared_fs = zcache_init_shared_fs,
+ .init_fs = zcache_init_fs
+};
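+
+/*
+ * How these hooks are driven (a hedged summary of the cleancache
+ * protocol): put_page is called when a clean page-cache page is
+ * reclaimed, get_page on a page-cache read miss, and the invalidate_*
+ * hooks when the page, inode or filesystem goes away, so zcache never
+ * serves stale data.
+ */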
+
+/*
+ * Debugfs functions
+ */
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static int pool_pages_get(void *_data, u64 *val)
+{
+ *val = zcache_pages();
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n");
+
+static struct dentry *zcache_debugfs_root;
+
+static int __init zcache_debugfs_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zcache_debugfs_root = debugfs_create_dir("zcache", NULL);
+ if (!zcache_debugfs_root)
+ return -ENOMEM;
+
+ debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root,
+ &zcache_pool_limit_hit);
+ debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root,
+ &zcache_zbud_alloc_fail);
+ debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root,
+ &zcache_dup_entry);
+ debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL,
+ &pool_page_fops);
+ debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root,
+ &zcache_stored_pages);
+ debugfs_create_atomic_t("stored_zero_pages", S_IRUGO,
+ zcache_debugfs_root, &zcache_stored_zero_pages);
+ debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root,
+ &zcache_evict_zpages);
+ debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root,
+ &zcache_evict_filepages);
+ debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root,
+ &zcache_reclaim_fail);
+ debugfs_create_u64("inactive_pages_refused", S_IRUGO,
+ zcache_debugfs_root, &zcache_inactive_pages_refused);
+ debugfs_create_u64("pool_shrink_count", S_IRUGO,
+ zcache_debugfs_root, &zcache_pool_shrink);
+ debugfs_create_u64("pool_shrink_fail", S_IRUGO,
+ zcache_debugfs_root, &zcache_pool_shrink_fail);
+ debugfs_create_u64("pool_shrink_pages", S_IRUGO,
+ zcache_debugfs_root, &zcache_pool_shrink_pages);
+ debugfs_create_u64("store_fail", S_IRUGO,
+ zcache_debugfs_root, &zcache_store_failed);
+ return 0;
+}
+
+static void __exit zcache_debugfs_exit(void)
+{
+ debugfs_remove_recursive(zcache_debugfs_root);
+}
+#else
+static int __init zcache_debugfs_init(void)
+{
+ return 0;
+}
+static void __exit zcache_debugfs_exit(void)
+{
+}
+#endif
+
+/*
+ * zcache init and exit
+ */
+static int __init init_zcache(void)
+{
+ if (!zcache_enabled)
+ return 0;
+
+ pr_info("loading zcache..\n");
+ if (zcache_rbnode_cache_create()) {
+ pr_err("entry cache creation failed\n");
+ goto error;
+ }
+
+ if (zcache_comp_init()) {
+ pr_err("compressor initialization failed\n");
+ goto compfail;
+ }
+ if (zcache_cpu_init()) {
+ pr_err("per-cpu initialization failed\n");
+ goto pcpufail;
+ }
+
+ spin_lock_init(&zcache.pool_lock);
+ cleancache_register_ops(&zcache_ops);
+
+ if (zcache_debugfs_init())
+ pr_warn("debugfs initialization failed\n");
+ register_shrinker(&zcache_shrinker);
+ return 0;
+pcpufail:
+ zcache_comp_exit();
+compfail:
+ zcache_rbnode_cache_destroy();
+error:
+ return -ENOMEM;
+}
+
+/* must be late so crypto has time to come up */
+late_initcall(init_zcache);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bob Liu <bob.liu@xxxxxxxxxx>");
+MODULE_DESCRIPTION("Compressed cache for clean file pages");
+
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c1ea19478119..3f1b584bd5d0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -16,32 +16,15 @@
* struct page(s) to form a zspage.
*
* Usage of struct page fields:
- * page->private: points to the first component (0-order) page
- * page->index (union with page->freelist): offset of the first object
- * starting in this page. For the first page, this is
- * always 0, so we use this field (aka freelist) to point
- * to the first free object in zspage.
- * page->lru: links together all component pages (except the first page)
- * of a zspage
- *
- * For _first_ page only:
- *
- * page->private: refers to the component page after the first page
- * If the page is first_page for huge object, it stores handle.
- * Look at size_class->huge.
- * page->freelist: points to the first free object in zspage.
- * Free objects are linked together using in-place
- * metadata.
- * page->objects: maximum number of objects we can store in this
- * zspage (class->zspage_order * PAGE_SIZE / class->size)
- * page->lru: links together first pages of various zspages.
- * Basically forming list of zspages in a fullness group.
- * page->mapping: class index and fullness group of the zspage
- * page->inuse: the number of objects that are used in this zspage
+ * page->private: points to zspage
+ * page->freelist(index): links together all component pages of a zspage
+ * For the huge page, this is always 0, so we use this field
+ * to store handle.
*
* Usage of struct page flags:
* PG_private: identifies the first component page
* PG_private2: identifies the last component page
+ * PG_owner_priv_1: identifies the huge component page
*
*/
@@ -64,6 +47,11 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
+#include <linux/mount.h>
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
+#define ZSPAGE_MAGIC 0x58
/*
* This must be a power of 2 and greater than or equal to sizeof(link_free).
@@ -86,9 +74,7 @@
* Object location (<PFN>, <obj_idx>) is encoded as
* a single (unsigned long) handle value.
*
- * Note that object index <obj_idx> is relative to system
- * page <PFN> it is stored in, so for each sub-page belonging
- * to a zspage, obj_idx starts with 0.
+ * Note that object index <obj_idx> starts from 0.
*
* This is made more complicated by various memory models and PAE.
*/
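/*
* Worked example (a sketch; OBJ_INDEX_BITS and OBJ_TAG_BITS are the
* constants defined nearby): for PFN 0x1000 and obj_idx 5,
* obj = ((0x1000UL << OBJ_INDEX_BITS) | 5) << OBJ_TAG_BITS;
* location_to_obj() builds exactly this value and obj_to_location()
* reverses the shifts and mask.
*/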
@@ -147,33 +133,29 @@
* ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
* (reason above)
*/
-#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
/*
* We do not maintain any list for completely empty or full pages
*/
enum fullness_group {
- ZS_ALMOST_FULL,
- ZS_ALMOST_EMPTY,
- _ZS_NR_FULLNESS_GROUPS,
-
ZS_EMPTY,
- ZS_FULL
+ ZS_ALMOST_EMPTY,
+ ZS_ALMOST_FULL,
+ ZS_FULL,
+ NR_ZS_FULLNESS,
};
enum zs_stat_type {
+ CLASS_EMPTY,
+ CLASS_ALMOST_EMPTY,
+ CLASS_ALMOST_FULL,
+ CLASS_FULL,
OBJ_ALLOCATED,
OBJ_USED,
- CLASS_ALMOST_FULL,
- CLASS_ALMOST_EMPTY,
+ NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
-#else
-#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
-#endif
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
@@ -182,6 +164,10 @@ struct zs_size_stat {
static struct dentry *zs_stat_root;
#endif
+#ifdef CONFIG_COMPACTION
+static struct vfsmount *zsmalloc_mnt;
+#endif
+
/*
* number of size_classes
*/
@@ -202,38 +188,53 @@ static int zs_size_classes;
* (see: fix_fullness_group())
*/
static const int fullness_threshold_frac = 4;
+static size_t huge_class_size;
struct size_class {
spinlock_t lock;
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+ struct list_head fullness_list[NR_ZS_FULLNESS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
*/
int size;
- unsigned int index;
-
+ int objs_per_zspage;
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
- struct zs_size_stat stats;
- /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
- bool huge;
+ unsigned int index;
+ struct zs_size_stat stats;
};
+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+static void SetPageHugeObject(struct page *page)
+{
+ SetPageOwnerPriv1(page);
+}
+
+static void ClearPageHugeObject(struct page *page)
+{
+ ClearPageOwnerPriv1(page);
+}
+
+static int PageHugeObject(struct page *page)
+{
+ return PageOwnerPriv1(page);
+}
+
/*
* Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
+ * For every zspage, zspage->freeobj gives head of this list.
*
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
union {
/*
- * Position of next free chunk (encodes <PFN, obj_idx>)
+ * Free object index;
* It's valid for non-allocated object
*/
- void *next;
+ unsigned long next;
/*
* Handle of allocated object.
*/
@@ -246,8 +247,8 @@ struct zs_pool {
struct size_class **size_class;
struct kmem_cache *handle_cachep;
+ struct kmem_cache *zspage_cachep;
- gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
struct zs_pool_stats stats;
@@ -262,16 +263,36 @@ struct zs_pool {
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
+#ifdef CONFIG_COMPACTION
+ struct inode *inode;
+ struct work_struct free_work;
+#endif
};
/*
* A zspage's class index and fullness group
* are encoded in its (first)page->mapping
*/
-#define CLASS_IDX_BITS 28
-#define FULLNESS_BITS 4
-#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
-#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1)
+#define FULLNESS_BITS 2
+#define CLASS_BITS 8
+#define ISOLATED_BITS 3
+#define MAGIC_VAL_BITS 8
+
+struct zspage {
+ struct {
+ unsigned int fullness:FULLNESS_BITS;
+ unsigned int class:CLASS_BITS;
+ unsigned int isolated:ISOLATED_BITS;
+ unsigned int magic:MAGIC_VAL_BITS;
+ };
+ unsigned int inuse;
+ unsigned int freeobj;
+ struct page *first_page;
+ struct list_head list; /* fullness list */
+#ifdef CONFIG_COMPACTION
+ rwlock_t lock;
+#endif
+};
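+
+/*
+ * The magic field is a sanity tag: alloc_zspage() sets it to
+ * ZSPAGE_MAGIC and get_zspage()/get_zspage_mapping() BUG on a
+ * mismatch, catching a stray page->private value early.
+ */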
struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
@@ -281,32 +302,76 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
- bool huge;
};
-static int create_handle_cache(struct zs_pool *pool)
+#ifdef CONFIG_COMPACTION
+static int zs_register_migration(struct zs_pool *pool);
+static void zs_unregister_migration(struct zs_pool *pool);
+static void migrate_lock_init(struct zspage *zspage);
+static void migrate_read_lock(struct zspage *zspage);
+static void migrate_read_unlock(struct zspage *zspage);
+static void kick_deferred_free(struct zs_pool *pool);
+static void init_deferred_free(struct zs_pool *pool);
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
+#else
+static int zsmalloc_mount(void) { return 0; }
+static void zsmalloc_unmount(void) {}
+static int zs_register_migration(struct zs_pool *pool) { return 0; }
+static void zs_unregister_migration(struct zs_pool *pool) {}
+static void migrate_lock_init(struct zspage *zspage) {}
+static void migrate_read_lock(struct zspage *zspage) {}
+static void migrate_read_unlock(struct zspage *zspage) {}
+static void kick_deferred_free(struct zs_pool *pool) {}
+static void init_deferred_free(struct zs_pool *pool) {}
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
+static int create_cache(struct zs_pool *pool)
{
pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
0, 0, NULL);
- return pool->handle_cachep ? 0 : 1;
+ if (!pool->handle_cachep)
+ return 1;
+
+ pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
+ 0, 0, NULL);
+ if (!pool->zspage_cachep) {
+ kmem_cache_destroy(pool->handle_cachep);
+ pool->handle_cachep = NULL;
+ return 1;
+ }
+
+ return 0;
}
-static void destroy_handle_cache(struct zs_pool *pool)
+static void destroy_cache(struct zs_pool *pool)
{
kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->zspage_cachep);
}
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- pool->flags & ~__GFP_HIGHMEM);
+ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
-static void free_handle(struct zs_pool *pool, unsigned long handle)
+static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
{
kmem_cache_free(pool->handle_cachep, (void *)handle);
}
+static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
+{
+ return kmem_cache_alloc(pool->zspage_cachep,
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+}
+
+static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+{
+ kmem_cache_free(pool->zspage_cachep, zspage);
+}
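+
+/*
+ * Note on the gfp masking above: handles and zspage structs come from
+ * slab, so __GFP_HIGHMEM and __GFP_MOVABLE are stripped from the
+ * caller-supplied mask; slab memory must be directly addressable and
+ * must not be movable.
+ */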
+
static void record_obj(unsigned long handle, unsigned long obj)
{
/*
@@ -325,7 +390,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
- return zs_create_pool(name, gfp);
+ /*
+ * Ignore global gfp flags: zs_malloc() may be invoked from
+ * different contexts and its caller must provide a valid
+ * gfp mask.
+ */
+ return zs_create_pool(name);
}
static void zs_zpool_destroy(void *pool)
@@ -336,7 +406,7 @@ static void zs_zpool_destroy(void *pool)
static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- *handle = zs_malloc(pool, size);
+ *handle = zs_malloc(pool, size, gfp);
return *handle ? 0 : -1;
}
static void zs_zpool_free(void *pool, unsigned long handle)
@@ -404,36 +474,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
-static int is_first_page(struct page *page)
+static bool is_zspage_isolated(struct zspage *zspage)
+{
+ return zspage->isolated;
+}
+
+static __maybe_unused int is_first_page(struct page *page)
{
return PagePrivate(page);
}
-static int is_last_page(struct page *page)
+/* Protected by class->lock */
+static inline int get_zspage_inuse(struct zspage *zspage)
+{
+ return zspage->inuse;
+}
+
+static inline void set_zspage_inuse(struct zspage *zspage, int val)
+{
+ zspage->inuse = val;
+}
+
+static inline void mod_zspage_inuse(struct zspage *zspage, int val)
+{
+ zspage->inuse += val;
+}
+
+static inline struct page *get_first_page(struct zspage *zspage)
+{
+ struct page *first_page = zspage->first_page;
+
+ VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+ return first_page;
+}
+
+static inline int get_first_obj_offset(struct page *page)
+{
+ return page->units;
+}
+
+static inline void set_first_obj_offset(struct page *page, int offset)
+{
+ page->units = offset;
+}
+
+static inline unsigned int get_freeobj(struct zspage *zspage)
+{
+ return zspage->freeobj;
+}
+
+static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
{
- return PagePrivate2(page);
+ zspage->freeobj = obj;
}
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+static void get_zspage_mapping(struct zspage *zspage,
+ unsigned int *class_idx,
enum fullness_group *fullness)
{
- unsigned long m;
- BUG_ON(!is_first_page(page));
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
- m = (unsigned long)page->mapping;
- *fullness = m & FULLNESS_MASK;
- *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+ *fullness = zspage->fullness;
+ *class_idx = zspage->class;
}
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+static void set_zspage_mapping(struct zspage *zspage,
+ unsigned int class_idx,
enum fullness_group fullness)
{
- unsigned long m;
- BUG_ON(!is_first_page(page));
-
- m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
- (fullness & FULLNESS_MASK);
- page->mapping = (struct address_space *)m;
+ zspage->class = class_idx;
+ zspage->fullness = fullness;
}
/*
@@ -454,26 +564,25 @@ static int get_size_class_index(int size)
return min(zs_size_classes - 1, idx);
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
- if (type < NR_ZS_STAT_TYPE)
- class->stats.objs[type] += cnt;
+ class->stats.objs[type] += cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
- if (type < NR_ZS_STAT_TYPE)
- class->stats.objs[type] -= cnt;
+ class->stats.objs[type] -= cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
+ int type)
{
- if (type < NR_ZS_STAT_TYPE)
- return class->stats.objs[type];
- return 0;
+ return class->stats.objs[type];
}
#ifdef CONFIG_ZSMALLOC_STAT
@@ -495,6 +604,8 @@ static void __exit zs_stat_exit(void)
debugfs_remove_recursive(zs_stat_root);
}
+static unsigned long zs_can_compact(struct size_class *class);
+
static int zs_stats_size_show(struct seq_file *s, void *v)
{
int i;
@@ -502,14 +613,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
struct size_class *class;
int objs_per_zspage;
unsigned long class_almost_full, class_almost_empty;
- unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
"class", "size", "almost_full", "almost_empty",
"obj_allocated", "obj_used", "pages_used",
- "pages_per_zspage");
+ "pages_per_zspage", "freeable");
for (i = 0; i < zs_size_classes; i++) {
class = pool->size_class[i];
@@ -522,6 +634,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size,
@@ -529,23 +642,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
i, class->size, class_almost_full, class_almost_empty,
obj_allocated, obj_used, pages_used,
- class->pages_per_zspage);
+ class->pages_per_zspage, freeable);
total_class_almost_full += class_almost_full;
total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
+ total_freeable += freeable;
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
"Total", "", total_class_almost_full,
total_class_almost_empty, total_objs,
- total_used_objs, total_pages);
+ total_used_objs, total_pages, "", total_freeable);
return 0;
}
@@ -562,7 +677,7 @@ static const struct file_operations zs_stat_size_ops = {
.release = single_release,
};
-static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static int zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
struct dentry *entry;
@@ -602,7 +717,7 @@ static void __exit zs_stat_exit(void)
{
}
-static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
return 0;
}
@@ -620,20 +735,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
* the pool (not yet implemented). This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct page *page)
+static enum fullness_group get_fullness_group(struct size_class *class,
+ struct zspage *zspage)
{
- int inuse, max_objects;
+ int inuse, objs_per_zspage;
enum fullness_group fg;
- BUG_ON(!is_first_page(page));
- inuse = page->inuse;
- max_objects = page->objects;
+ inuse = get_zspage_inuse(zspage);
+ objs_per_zspage = class->objs_per_zspage;
if (inuse == 0)
fg = ZS_EMPTY;
- else if (inuse == max_objects)
+ else if (inuse == objs_per_zspage)
fg = ZS_FULL;
- else if (inuse <= 3 * max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -647,59 +762,41 @@ static enum fullness_group get_fullness_group(struct page *page)
* have. This functions inserts the given zspage into the freelist
* identified by <class, fullness_group>.
*/
-static void insert_zspage(struct page *page, struct size_class *class,
+static void insert_zspage(struct size_class *class,
+ struct zspage *zspage,
enum fullness_group fullness)
{
- struct page **head;
-
- BUG_ON(!is_first_page(page));
-
- if (fullness >= _ZS_NR_FULLNESS_GROUPS)
- return;
-
- zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
-
- head = &class->fullness_list[fullness];
- if (!*head) {
- *head = page;
- return;
- }
+ struct zspage *head;
+ zs_stat_inc(class, fullness, 1);
+ head = list_first_entry_or_null(&class->fullness_list[fullness],
+ struct zspage, list);
/*
- * We want to see more ZS_FULL pages and less almost
- * empty/full. Put pages with higher ->inuse first.
+ * We want to see more ZS_FULL pages and less almost empty/full.
+ * Put pages with higher ->inuse first.
*/
- list_add_tail(&page->lru, &(*head)->lru);
- if (page->inuse >= (*head)->inuse)
- *head = page;
+ if (head) {
+ if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
+ list_add(&zspage->list, &head->list);
+ return;
+ }
+ }
+ list_add(&zspage->list, &class->fullness_list[fullness]);
}
/*
* This function removes the given zspage from the freelist identified
* by <class, fullness_group>.
*/
-static void remove_zspage(struct page *page, struct size_class *class,
+static void remove_zspage(struct size_class *class,
+ struct zspage *zspage,
enum fullness_group fullness)
{
- struct page **head;
+ VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
+ VM_BUG_ON(is_zspage_isolated(zspage));
- BUG_ON(!is_first_page(page));
-
- if (fullness >= _ZS_NR_FULLNESS_GROUPS)
- return;
-
- head = &class->fullness_list[fullness];
- BUG_ON(!*head);
- if (list_empty(&(*head)->lru))
- *head = NULL;
- else if (*head == page)
- *head = (struct page *)list_entry((*head)->lru.next,
- struct page, lru);
-
- list_del_init(&page->lru);
- zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
- CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+ list_del_init(&zspage->list);
+ zs_stat_dec(class, fullness, 1);
}
/*
@@ -712,21 +809,22 @@ static void remove_zspage(struct page *page, struct size_class *class,
* fullness group.
*/
static enum fullness_group fix_fullness_group(struct size_class *class,
- struct page *page)
+ struct zspage *zspage)
{
int class_idx;
enum fullness_group currfg, newfg;
- BUG_ON(!is_first_page(page));
-
- get_zspage_mapping(page, &class_idx, &currfg);
- newfg = get_fullness_group(page);
+ get_zspage_mapping(zspage, &class_idx, &currfg);
+ newfg = get_fullness_group(class, zspage);
if (newfg == currfg)
goto out;
- remove_zspage(page, class, currfg);
- insert_zspage(page, class, newfg);
- set_zspage_mapping(page, class_idx, newfg);
+ if (!is_zspage_isolated(zspage)) {
+ remove_zspage(class, zspage, currfg);
+ insert_zspage(class, zspage, newfg);
+ }
+
+ set_zspage_mapping(zspage, class_idx, newfg);
out:
return newfg;
@@ -768,64 +866,49 @@ static int get_pages_per_zspage(int class_size)
return max_usedpc_order;
}
-/*
- * A single 'zspage' is composed of many system pages which are
- * linked together using fields in struct page. This function finds
- * the first/head page, given any component page of a zspage.
- */
-static struct page *get_first_page(struct page *page)
+static struct zspage *get_zspage(struct page *page)
{
- if (is_first_page(page))
- return page;
- else
- return (struct page *)page_private(page);
+ struct zspage *zspage = (struct zspage *)page->private;
+
+ BUG_ON(zspage->magic != ZSPAGE_MAGIC);
+ return zspage;
}
static struct page *get_next_page(struct page *page)
{
- struct page *next;
+ if (unlikely(PageHugeObject(page)))
+ return NULL;
- if (is_last_page(page))
- next = NULL;
- else if (is_first_page(page))
- next = (struct page *)page_private(page);
- else
- next = list_entry(page->lru.next, struct page, lru);
+ return page->freelist;
+}
- return next;
+/**
+ * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @obj: the encoded object value
+ * @page: out parameter for the page the object resides in
+ * @obj_idx: out parameter for the object index
+ */
+static void obj_to_location(unsigned long obj, struct page **page,
+ unsigned int *obj_idx)
+{
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
}
-/*
- * Encode <page, obj_idx> as a single handle value.
- * We use the least bit of handle for tagging.
+/**
+ * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
+ * @page: page the object resides in
+ * @obj_idx: object index within the zspage
*/
-static void *location_to_obj(struct page *page, unsigned long obj_idx)
+static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
{
unsigned long obj;
- if (!page) {
- BUG_ON(obj_idx);
- return NULL;
- }
-
obj = page_to_pfn(page) << OBJ_INDEX_BITS;
- obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj |= obj_idx & OBJ_INDEX_MASK;
obj <<= OBJ_TAG_BITS;
- return (void *)obj;
-}
-
-/*
- * Decode <page, obj_idx> pair from the given object handle. We adjust the
- * decoded obj_idx back to its original value since it was adjusted in
- * location_to_obj().
- */
-static void obj_to_location(unsigned long obj, struct page **page,
- unsigned long *obj_idx)
-{
- obj >>= OBJ_TAG_BITS;
- *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
- *obj_idx = (obj & OBJ_INDEX_MASK);
+ return obj;
}
static unsigned long handle_to_obj(unsigned long handle)
@@ -833,108 +916,146 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static unsigned long obj_to_head(struct size_class *class, struct page *page,
- void *obj)
+static unsigned long obj_to_head(struct page *page, void *obj)
{
- if (class->huge) {
- VM_BUG_ON(!is_first_page(page));
- return page_private(page);
+ if (unlikely(PageHugeObject(page))) {
+ VM_BUG_ON_PAGE(!is_first_page(page), page);
+ return page->index;
} else
return *(unsigned long *)obj;
}
-static unsigned long obj_idx_to_offset(struct page *page,
- unsigned long obj_idx, int class_size)
+static inline int testpin_tag(unsigned long handle)
{
- unsigned long off = 0;
-
- if (!is_first_page(page))
- off = page->index;
-
- return off + obj_idx * class_size;
+ return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static inline int trypin_tag(unsigned long handle)
{
- unsigned long *ptr = (unsigned long *)handle;
-
- return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+ return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void pin_tag(unsigned long handle)
{
- while (!trypin_tag(handle));
+ bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void unpin_tag(unsigned long handle)
{
- unsigned long *ptr = (unsigned long *)handle;
-
- clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+ bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
}
static void reset_page(struct page *page)
{
+ __ClearPageMovable(page);
clear_bit(PG_private, &page->flags);
clear_bit(PG_private_2, &page->flags);
set_page_private(page, 0);
- page->mapping = NULL;
- page->freelist = NULL;
page_mapcount_reset(page);
+ ClearPageHugeObject(page);
+ page->freelist = NULL;
}
-static void free_zspage(struct page *first_page)
+/*
+ * To prevent zspage destroy during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+void lock_zspage(struct zspage *zspage)
{
- struct page *nextp, *tmp, *head_extra;
+ struct page *page = get_first_page(zspage);
- BUG_ON(!is_first_page(first_page));
- BUG_ON(first_page->inuse);
+ do {
+ lock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
- head_extra = (struct page *)page_private(first_page);
+int trylock_zspage(struct zspage *zspage)
+{
+ struct page *cursor, *fail;
- reset_page(first_page);
- __free_page(first_page);
+ for (cursor = get_first_page(zspage); cursor != NULL; cursor =
+ get_next_page(cursor)) {
+ if (!trylock_page(cursor)) {
+ fail = cursor;
+ goto unlock;
+ }
+ }
- /* zspage with only 1 system page */
- if (!head_extra)
- return;
+ return 1;
+unlock:
+ for (cursor = get_first_page(zspage); cursor != fail; cursor =
+ get_next_page(cursor))
+ unlock_page(cursor);
+
+ return 0;
+}
+
+static void __free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ struct page *page, *next;
+ enum fullness_group fg;
+ unsigned int class_idx;
+
+ get_zspage_mapping(zspage, &class_idx, &fg);
+
+ assert_spin_locked(&class->lock);
+
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(fg != ZS_EMPTY);
+
+ next = page = get_first_page(zspage);
+ do {
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ next = get_next_page(page);
+ reset_page(page);
+ unlock_page(page);
+ put_page(page);
+ page = next;
+ } while (page != NULL);
+
+ cache_free_zspage(pool, zspage);
+
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+}
- list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
- list_del(&nextp->lru);
- reset_page(nextp);
- __free_page(nextp);
+static void free_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zspage *zspage)
+{
+ VM_BUG_ON(get_zspage_inuse(zspage));
+ VM_BUG_ON(list_empty(&zspage->list));
+
+ if (!trylock_zspage(zspage)) {
+ kick_deferred_free(pool);
+ return;
}
- reset_page(head_extra);
- __free_page(head_extra);
+
+ remove_zspage(class, zspage, ZS_EMPTY);
+ __free_zspage(pool, class, zspage);
}
/* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
+static void init_zspage(struct size_class *class, struct zspage *zspage)
{
+ unsigned int freeobj = 1;
unsigned long off = 0;
- struct page *page = first_page;
+ struct page *page = get_first_page(zspage);
- BUG_ON(!is_first_page(first_page));
while (page) {
struct page *next_page;
struct link_free *link;
- unsigned int i = 1;
void *vaddr;
- /*
- * page->index stores offset of first object starting
- * in the page. For the first page, this is always 0,
- * so we use first_page->index (aka ->freelist) to store
- * head of corresponding zspage's freelist.
- */
- if (page != first_page)
- page->index = off;
+ set_first_obj_offset(page, off);
vaddr = kmap_atomic(page);
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = location_to_obj(page, i++);
+ link->next = freeobj++ << OBJ_ALLOCATED_TAG;
link += class->size / sizeof(*link);
}
@@ -944,87 +1065,108 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = location_to_obj(next_page, 0);
+ if (next_page) {
+ link->next = freeobj++ << OBJ_ALLOCATED_TAG;
+ } else {
+ /*
+ * Reset the OBJ_ALLOCATED_TAG bit in the last link so we can
+ * tell whether an object is allocated or not.
+ */
+ link->next = -1UL << OBJ_ALLOCATED_TAG;
+ }
kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
}
+
+ set_freeobj(zspage, 0);
}
-/*
- * Allocate a zspage for the given size class
- */
-static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+static void create_page_chain(struct size_class *class, struct zspage *zspage,
+ struct page *pages[])
{
- int i, error;
- struct page *first_page = NULL, *uninitialized_var(prev_page);
+ int i;
+ struct page *page;
+ struct page *prev_page = NULL;
+ int nr_pages = class->pages_per_zspage;
/*
* Allocate individual pages and link them together as:
- * 1. first page->private = first sub-page
- * 2. all sub-pages are linked together using page->lru
- * 3. each sub-page is linked to the first page using page->private
+ * 1. all pages are linked together using page->freelist
+ * 2. each sub-page points to the zspage using page->private
*
- * For each size class, First/Head pages are linked together using
- * page->lru. Also, we set PG_private to identify the first page
- * (i.e. no other sub-page has this flag set) and PG_private_2 to
- * identify the last page.
+ * We set PG_private to identify the first page (i.e. no other sub-page
+ * has this flag set) and PG_private_2 to identify the last page.
*/
- error = -ENOMEM;
- for (i = 0; i < class->pages_per_zspage; i++) {
- struct page *page;
-
- page = alloc_page(flags);
- if (!page)
- goto cleanup;
-
- INIT_LIST_HEAD(&page->lru);
- if (i == 0) { /* first page */
+ for (i = 0; i < nr_pages; i++) {
+ page = pages[i];
+ set_page_private(page, (unsigned long)zspage);
+ page->freelist = NULL;
+ if (i == 0) {
+ zspage->first_page = page;
SetPagePrivate(page);
- set_page_private(page, 0);
- first_page = page;
- first_page->inuse = 0;
+ if (unlikely(class->objs_per_zspage == 1 &&
+ class->pages_per_zspage == 1))
+ SetPageHugeObject(page);
+ } else {
+ prev_page->freelist = page;
}
- if (i == 1)
- set_page_private(first_page, (unsigned long)page);
- if (i >= 1)
- set_page_private(page, (unsigned long)first_page);
- if (i >= 2)
- list_add(&page->lru, &prev_page->lru);
- if (i == class->pages_per_zspage - 1) /* last page */
+ if (i == nr_pages - 1)
SetPagePrivate2(page);
prev_page = page;
}
+}
- init_zspage(first_page, class);
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct zspage *alloc_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ gfp_t gfp)
+{
+ int i;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+ struct zspage *zspage = cache_alloc_zspage(pool, gfp);
- first_page->freelist = location_to_obj(first_page, 0);
- /* Maximum number of objects we can store in this zspage */
- first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+ if (!zspage)
+ return NULL;
- error = 0; /* Success */
+ memset(zspage, 0, sizeof(struct zspage));
+ zspage->magic = ZSPAGE_MAGIC;
+ migrate_lock_init(zspage);
-cleanup:
- if (unlikely(error) && first_page) {
- free_zspage(first_page);
- first_page = NULL;
+ for (i = 0; i < class->pages_per_zspage; i++) {
+ struct page *page;
+
+ page = alloc_page(gfp);
+ if (!page) {
+ while (--i >= 0)
+ __free_page(pages[i]);
+ cache_free_zspage(pool, zspage);
+ return NULL;
+ }
+ pages[i] = page;
}
- return first_page;
+ create_page_chain(class, zspage, pages);
+ init_zspage(class, zspage);
+
+ return zspage;
}
-static struct page *find_get_zspage(struct size_class *class)
+static struct zspage *find_get_zspage(struct size_class *class)
{
int i;
- struct page *page;
+ struct zspage *zspage;
- for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
- page = class->fullness_list[i];
- if (page)
+ for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[i],
+ struct zspage, list);
+ if (zspage)
break;
}
- return page;
+ return zspage;
}
#ifdef CONFIG_PGTABLE_MAPPING
@@ -1127,11 +1269,9 @@ static void __zs_unmap_object(struct mapping_area *area,
goto out;
buf = area->vm_buf;
- if (!area->huge) {
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
- }
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -1231,11 +1371,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-static bool zspage_full(struct page *page)
+static bool zspage_full(struct size_class *class, struct zspage *zspage)
{
- BUG_ON(!is_first_page(page));
-
- return page->inuse == page->objects;
+ return get_zspage_inuse(zspage) == class->objs_per_zspage;
}
unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1261,8 +1399,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
+ struct zspage *zspage;
struct page *page;
- unsigned long obj, obj_idx, off;
+ unsigned long obj, off;
+ unsigned int obj_idx;
unsigned int class_idx;
enum fullness_group fg;
@@ -1271,23 +1411,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
struct page *pages[2];
void *ret;
- BUG_ON(!handle);
-
/*
* Because we use per-cpu mapping areas shared among the
* pools/users, we can't allow mapping in interrupt context
* because it can corrupt another users mappings.
*/
- BUG_ON(in_interrupt());
+ WARN_ON_ONCE(in_interrupt());
/* From now on, migration cannot move the object */
pin_tag(handle);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ zspage = get_zspage(page);
+
+ /* migration cannot move any subpage in this zspage */
+ migrate_read_lock(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fg);
class = pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = (class->size * obj_idx) & ~PAGE_MASK;
area = &get_cpu_var(zs_map_area);
area->vm_mm = mm;
@@ -1305,7 +1448,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
ret = __zs_map_object(area, pages, off, class->size);
out:
- if (!class->huge)
+ if (likely(!PageHugeObject(page)))
ret += ZS_HANDLE_SIZE;
return ret;
@@ -1314,21 +1457,22 @@ EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
+ struct zspage *zspage;
struct page *page;
- unsigned long obj, obj_idx, off;
+ unsigned long obj, off;
+ unsigned int obj_idx;
unsigned int class_idx;
enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
- BUG_ON(!handle);
-
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
- get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
class = pool->size_class[class_idx];
- off = obj_idx_to_offset(page, obj_idx, class->size);
+ off = (class->size * obj_idx) & ~PAGE_MASK;
area = this_cpu_ptr(&zs_map_area);
if (off + class->size <= PAGE_SIZE)
@@ -1343,38 +1487,69 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
put_cpu_var(zs_map_area);
+
+ migrate_read_unlock(zspage);
unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
-static unsigned long obj_malloc(struct page *first_page,
- struct size_class *class, unsigned long handle)
+/**
+ * zs_huge_class_size() - Returns the size (in bytes) of the first huge
+ * zsmalloc &size_class.
+ * @pool: zsmalloc pool to use
+ *
+ * The function returns the size of the first huge class - any object of equal
+ * or bigger size will be stored in zspage consisting of a single physical
+ * page.
+ *
+ * Context: Any context.
+ *
+ * Return: the size (in bytes) of the first huge zsmalloc &size_class.
+ */
+size_t zs_huge_class_size(struct zs_pool *pool)
{
+ return huge_class_size;
+}
+EXPORT_SYMBOL_GPL(zs_huge_class_size);
+
+static unsigned long obj_malloc(struct size_class *class,
+ struct zspage *zspage, unsigned long handle)
+{
+ int i, nr_page, offset;
unsigned long obj;
struct link_free *link;
struct page *m_page;
- unsigned long m_objidx, m_offset;
+ unsigned long m_offset;
void *vaddr;
handle |= OBJ_ALLOCATED_TAG;
- obj = (unsigned long)first_page->freelist;
- obj_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+ obj = get_freeobj(zspage);
+
+ offset = obj * class->size;
+ nr_page = offset >> PAGE_SHIFT;
+ m_offset = offset & ~PAGE_MASK;
+ m_page = get_first_page(zspage);
+
+ for (i = 0; i < nr_page; i++)
+ m_page = get_next_page(m_page);
vaddr = kmap_atomic(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
- first_page->freelist = link->next;
- if (!class->huge)
+ set_freeobj(zspage, link->next >> OBJ_ALLOCATED_TAG);
+ if (likely(!PageHugeObject(m_page)))
/* record handle in the header of allocated chunk */
link->handle = handle;
else
- /* record handle in first_page->private */
- set_page_private(first_page, handle);
+ /* record handle to page->index */
+ zspage->first_page->index = handle;
+
kunmap_atomic(vaddr);
- first_page->inuse++;
+ mod_zspage_inuse(zspage, 1);
zs_stat_inc(class, OBJ_USED, 1);
+ obj = location_to_obj(m_page, obj);
+
return obj;
}
@@ -1388,16 +1563,17 @@ static unsigned long obj_malloc(struct page *first_page,
* otherwise 0.
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
- struct page *first_page;
+ enum fullness_group newfg;
+ struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
- handle = alloc_handle(pool);
+ handle = cache_alloc_handle(pool, gfp);
if (!handle)
return 0;
@@ -1406,71 +1582,79 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
- first_page = find_get_zspage(class);
-
- if (!first_page) {
+ zspage = find_get_zspage(class);
+ if (likely(zspage)) {
+ obj = obj_malloc(class, zspage, handle);
+ /* Now move the zspage to another fullness group, if required */
+ fix_fullness_group(class, zspage);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page)) {
- free_handle(pool, handle);
- return 0;
- }
- set_zspage_mapping(first_page, class->index, ZS_EMPTY);
- atomic_long_add(class->pages_per_zspage,
- &pool->pages_allocated);
+ return handle;
+ }
- spin_lock(&class->lock);
- zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
+ spin_unlock(&class->lock);
+
+ zspage = alloc_zspage(pool, class, gfp);
+ if (!zspage) {
+ cache_free_handle(pool, handle);
+ return 0;
}
- obj = obj_malloc(first_page, class, handle);
- /* Now move the zspage to another fullness group, if required */
- fix_fullness_group(class, first_page);
+ spin_lock(&class->lock);
+ obj = obj_malloc(class, zspage, handle);
+ newfg = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, newfg);
+ set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
+ atomic_long_add(class->pages_per_zspage,
+ &pool->pages_allocated);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+
+ /* We completely set up zspage so mark them as movable */
+ SetZsPageMovable(pool, zspage);
spin_unlock(&class->lock);
return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(struct zs_pool *pool, struct size_class *class,
- unsigned long obj)
+static void obj_free(struct size_class *class, unsigned long obj)
{
struct link_free *link;
- struct page *first_page, *f_page;
- unsigned long f_objidx, f_offset;
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long f_offset;
+ unsigned int f_objidx;
void *vaddr;
- BUG_ON(!obj);
-
obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
- first_page = get_first_page(f_page);
-
- f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+ f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+ zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
/* Insert this object into the containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- link->next = first_page->freelist;
- if (class->huge)
- set_page_private(first_page, 0);
+ link->next = get_freeobj(zspage) << OBJ_ALLOCATED_TAG;
kunmap_atomic(vaddr);
- first_page->freelist = (void *)obj;
- first_page->inuse--;
+ set_freeobj(zspage, f_objidx);
+ mod_zspage_inuse(zspage, -1);
zs_stat_dec(class, OBJ_USED, 1);
}
void zs_free(struct zs_pool *pool, unsigned long handle)
{
- struct page *first_page, *f_page;
- unsigned long obj, f_objidx;
+ struct zspage *zspage;
+ struct page *f_page;
+ unsigned long obj;
+ unsigned int f_objidx;
int class_idx;
struct size_class *class;
enum fullness_group fullness;
+ bool isolated;
if (unlikely(!handle))
return;
@@ -1478,33 +1662,39 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
pin_tag(handle);
obj = handle_to_obj(handle);
obj_to_location(obj, &f_page, &f_objidx);
- first_page = get_first_page(f_page);
+ zspage = get_zspage(f_page);
+
+ migrate_read_lock(zspage);
- get_zspage_mapping(first_page, &class_idx, &fullness);
+ get_zspage_mapping(zspage, &class_idx, &fullness);
class = pool->size_class[class_idx];
spin_lock(&class->lock);
- obj_free(pool, class, obj);
- fullness = fix_fullness_group(class, first_page);
- if (fullness == ZS_EMPTY) {
- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
- free_zspage(first_page);
+ obj_free(class, obj);
+ fullness = fix_fullness_group(class, zspage);
+ if (fullness != ZS_EMPTY) {
+ migrate_read_unlock(zspage);
+ goto out;
}
+
+ isolated = is_zspage_isolated(zspage);
+ migrate_read_unlock(zspage);
+ /* If zspage is isolated, zs_page_putback will free the zspage */
+ if (likely(!isolated))
+ free_zspage(pool, class, zspage);
+out:
+
spin_unlock(&class->lock);
unpin_tag(handle);
-
- free_handle(pool, handle);
+ cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long dst, unsigned long src,
- struct size_class *class)
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+ unsigned long src)
{
struct page *s_page, *d_page;
- unsigned long s_objidx, d_objidx;
+ unsigned int s_objidx, d_objidx;
unsigned long s_off, d_off;
void *s_addr, *d_addr;
int s_size, d_size, size;
@@ -1515,8 +1705,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
obj_to_location(src, &s_page, &s_objidx);
obj_to_location(dst, &d_page, &d_objidx);
- s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
- d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+ s_off = (class->size * s_objidx) & ~PAGE_MASK;
+ d_off = (class->size * d_objidx) & ~PAGE_MASK;
if (s_off + class->size > PAGE_SIZE)
s_size = PAGE_SIZE - s_off;
@@ -1544,7 +1734,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
s_page = get_next_page(s_page);
- BUG_ON(!s_page);
s_addr = kmap_atomic(s_page);
d_addr = kmap_atomic(d_page);
s_size = class->size - written;
@@ -1554,7 +1743,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
if (d_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
d_page = get_next_page(d_page);
- BUG_ON(!d_page);
d_addr = kmap_atomic(d_page);
d_size = class->size - written;
d_off = 0;
@@ -1569,20 +1757,19 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
* Find an allocated object in the zspage, starting from the given index,
* and return its handle.
*/
-static unsigned long find_alloced_obj(struct page *page, int index,
- struct size_class *class)
+static unsigned long find_alloced_obj(struct size_class *class,
+ struct page *page, int index)
{
unsigned long head;
int offset = 0;
unsigned long handle = 0;
void *addr = kmap_atomic(page);
- if (!is_first_page(page))
- offset = page->index;
+ offset = get_first_obj_offset(page);
offset += class->size * index;
while (offset < PAGE_SIZE) {
- head = obj_to_head(class, page, addr + offset);
+ head = obj_to_head(page, addr + offset);
if (head & OBJ_ALLOCATED_TAG) {
handle = head & ~OBJ_ALLOCATED_TAG;
if (trypin_tag(handle))
@@ -1599,7 +1786,7 @@ static unsigned long find_alloced_obj(struct page *page, int index,
}
struct zs_compact_control {
- /* Source page for migration which could be a subpage of zspage. */
+ /* Source page for migration; it may be a subpage of the zspage */
struct page *s_page;
/* Destination page for migration, which should be the first page
* of a zspage. */
@@ -1620,7 +1807,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
int ret = 0;
while (1) {
- handle = find_alloced_obj(s_page, index, class);
+ handle = find_alloced_obj(class, s_page, index);
if (!handle) {
s_page = get_next_page(s_page);
if (!s_page)
@@ -1630,15 +1817,15 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
/* Stop if there is no more space */
- if (zspage_full(d_page)) {
+ if (zspage_full(class, get_zspage(d_page))) {
unpin_tag(handle);
ret = -ENOMEM;
break;
}
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(free_obj, used_obj, class);
+ free_obj = obj_malloc(class, get_zspage(d_page), handle);
+ zs_object_copy(class, free_obj, used_obj);
index++;
/*
* record_obj updates handle's value to free_obj and it will
@@ -1649,7 +1836,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
- obj_free(pool, class, used_obj);
+ obj_free(class, used_obj);
}
/* Remember last position in this iteration */
@@ -1659,71 +1846,423 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
return ret;
}
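
The subtle part of the loop above is the pin dance: record_obj() must publish the new location with the pin bit already set, or a racing zs_free() could act on the handle between the store and unpin_tag(). Condensed from the code above (an ordering sketch only):

    used_obj = handle_to_obj(handle);
    free_obj = obj_malloc(class, get_zspage(d_page), handle);
    zs_object_copy(class, free_obj, used_obj);
    free_obj |= BIT(HANDLE_PIN_BIT);  /* keep the handle pinned across the swap */
    record_obj(handle, free_obj);     /* handle now resolves to the new object */
    unpin_tag(handle);                /* a racing zs_free() may proceed */
    obj_free(class, used_obj);        /* old slot returns to the freelist */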
-static struct page *isolate_target_page(struct size_class *class)
+static struct zspage *isolate_zspage(struct size_class *class, bool source)
{
int i;
- struct page *page;
+ struct zspage *zspage;
+ enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
- for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
- page = class->fullness_list[i];
- if (page) {
- remove_zspage(page, class, i);
- break;
+ if (!source) {
+ fg[0] = ZS_ALMOST_FULL;
+ fg[1] = ZS_ALMOST_EMPTY;
+ }
+
+ for (i = 0; i < 2; i++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
+ struct zspage, list);
+ if (zspage) {
+ VM_BUG_ON(is_zspage_isolated(zspage));
+ remove_zspage(class, zspage, fg[i]);
+ return zspage;
}
}
- return page;
+ return zspage;
}
/*
- * putback_zspage - add @first_page into right class's fullness list
- * @pool: target pool
+ * putback_zspage - add @zspage into the right class's fullness list
* @class: destination class
- * @first_page: target page
+ * @zspage: target zspage
*
- * Return @fist_page's fullness_group
+ * Return @zspage's fullness_group
*/
-static enum fullness_group putback_zspage(struct zs_pool *pool,
- struct size_class *class,
- struct page *first_page)
+static enum fullness_group putback_zspage(struct size_class *class,
+ struct zspage *zspage)
{
enum fullness_group fullness;
- BUG_ON(!is_first_page(first_page));
+ VM_BUG_ON(is_zspage_isolated(zspage));
- fullness = get_fullness_group(first_page);
- insert_zspage(first_page, class, fullness);
- set_zspage_mapping(first_page, class->index, fullness);
+ fullness = get_fullness_group(class, zspage);
+ insert_zspage(class, zspage, fullness);
+ set_zspage_mapping(zspage, class->index, fullness);
- if (fullness == ZS_EMPTY) {
- zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
- class->size, class->pages_per_zspage));
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
+ return fullness;
+}
+
+#ifdef CONFIG_COMPACTION
+static struct dentry *zs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+}
+
+static struct file_system_type zsmalloc_fs = {
+ .name = "zsmalloc",
+ .mount = zs_mount,
+ .kill_sb = kill_anon_super,
+};
+
+static int zsmalloc_mount(void)
+{
+ int ret = 0;
+
+ zsmalloc_mnt = kern_mount(&zsmalloc_fs);
+ if (IS_ERR(zsmalloc_mnt))
+ ret = PTR_ERR(zsmalloc_mnt);
+
+ return ret;
+}
+
+static void zsmalloc_unmount(void)
+{
+ kern_unmount(zsmalloc_mnt);
+}
+
+static void migrate_lock_init(struct zspage *zspage)
+{
+ rwlock_init(&zspage->lock);
+}
+
+static void migrate_read_lock(struct zspage *zspage)
+{
+ read_lock(&zspage->lock);
+}
- free_zspage(first_page);
+static void migrate_read_unlock(struct zspage *zspage)
+{
+ read_unlock(&zspage->lock);
+}
+
+static void migrate_write_lock(struct zspage *zspage)
+{
+ write_lock(&zspage->lock);
+}
+
+static void migrate_write_unlock(struct zspage *zspage)
+{
+ write_unlock(&zspage->lock);
+}
+
+/* Number of isolated subpages for *page migration* in this zspage */
+static void inc_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated++;
+}
+
+static void dec_zspage_isolation(struct zspage *zspage)
+{
+ zspage->isolated--;
+}
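
As far as this patch shows, the intended lock usage is: object paths take the read side so the zspage's page chain stays stable under them, and page migration takes the write side to swap a subpage out. A sketch, not new API:

    /* illustrative reader, mirroring zs_free() above */
    migrate_read_lock(zspage);
    /* ... the page chain of zspage is stable here ... */
    migrate_read_unlock(zspage);

    /* illustrative writer, mirroring zs_page_migrate() below */
    migrate_write_lock(zspage);
    /* ... replace_sub_page(), fix up pinned handles ... */
    migrate_write_unlock(zspage);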
+
+static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+ struct page *newpage, struct page *oldpage)
+{
+ struct page *page;
+ struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+ int idx = 0;
+
+ page = get_first_page(zspage);
+ do {
+ if (page == oldpage)
+ pages[idx] = newpage;
+ else
+ pages[idx] = page;
+ idx++;
+ } while ((page = get_next_page(page)) != NULL);
+
+ create_page_chain(class, zspage, pages);
+ set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+ if (unlikely(PageHugeObject(oldpage)))
+ newpage->index = oldpage->index;
+ __SetPageMovable(newpage, page_mapping(oldpage));
+}
+
+bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct address_space *mapping;
+
+ /*
+ * The page is locked, so the zspage cannot be destroyed. For details,
+ * see lock_zspage() in free_zspage().
+ */
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /*
+ * Without the class lock, fullness could be stale while class_idx is
+ * okay, because class_idx stays constant unless the page is freed; so
+ * we must re-read fullness under the class lock.
+ */
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ if (get_zspage_inuse(zspage) == 0) {
+ spin_unlock(&class->lock);
+ return false;
}
- return fullness;
+ /* zspage is isolated for object migration */
+ if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ spin_unlock(&class->lock);
+ return false;
+ }
+
+ /*
+ * If this is the first isolation of the zspage, remove it from its
+ * size_class to prevent further object allocations from it.
+ */
+ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ remove_zspage(class, zspage, fullness);
+ }
+
+ inc_zspage_isolation(zspage);
+ spin_unlock(&class->lock);
+
+ return true;
+}
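
Condensed, the refusal cases of zs_page_isolate() as they read above (an annotation, not code from the patch):

    /*
     * get_zspage_inuse(zspage) == 0          -> false: empty zspage on its way out
     * list_empty(&zspage->list) && !isolated -> false: object migration owns it
     * otherwise: on first isolation, remove the zspage from its fullness
     * list so no new objects land in it, then bump the isolation counter.
     */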
+
+int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage;
+ struct page *dummy;
+ void *s_addr, *d_addr, *addr;
+ int offset, pos;
+ unsigned long handle, head;
+ unsigned long old_obj, new_obj;
+ unsigned int obj_idx;
+ int ret = -EAGAIN;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+
+ /* A concurrent compactor cannot migrate any subpage of this zspage */
+ migrate_write_lock(zspage);
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+ offset = get_first_obj_offset(page);
+
+ spin_lock(&class->lock);
+ if (!get_zspage_inuse(zspage)) {
+ ret = -EBUSY;
+ goto unlock_class;
+ }
+
+ pos = offset;
+ s_addr = kmap_atomic(page);
+ while (pos < PAGE_SIZE) {
+ head = obj_to_head(page, s_addr + pos);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!trypin_tag(handle))
+ goto unpin_objects;
+ }
+ pos += class->size;
+ }
+
+ /*
+ * At this point no user can access any object in the zspage, so it is
+ * safe to move them.
+ */
+ d_addr = kmap_atomic(newpage);
+ memcpy(d_addr, s_addr, PAGE_SIZE);
+ kunmap_atomic(d_addr);
+
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+
+ old_obj = handle_to_obj(handle);
+ obj_to_location(old_obj, &dummy, &obj_idx);
+ new_obj = (unsigned long)location_to_obj(newpage,
+ obj_idx);
+ new_obj |= BIT(HANDLE_PIN_BIT);
+ record_obj(handle, new_obj);
+ }
+ }
+
+ replace_sub_page(class, zspage, newpage, page);
+ get_page(newpage);
+
+ dec_zspage_isolation(zspage);
+
+ /*
+ * Page migration is done, so put the isolated zspage back on its
+ * fullness list if @page was the last isolated subpage of the zspage.
+ */
+ if (!is_zspage_isolated(zspage))
+ putback_zspage(class, zspage);
+
+ reset_page(page);
+ put_page(page);
+ page = newpage;
+
+ ret = MIGRATEPAGE_SUCCESS;
+unpin_objects:
+ for (addr = s_addr + offset; addr < s_addr + pos;
+ addr += class->size) {
+ head = obj_to_head(page, addr);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (!testpin_tag(handle))
+ BUG();
+ unpin_tag(handle);
+ }
+ }
+ kunmap_atomic(s_addr);
+unlock_class:
+ spin_unlock(&class->lock);
+ migrate_write_unlock(zspage);
+
+ return ret;
+}
+
+void zs_page_putback(struct page *page)
+{
+ struct zs_pool *pool;
+ struct size_class *class;
+ int class_idx;
+ enum fullness_group fg;
+ struct address_space *mapping;
+ struct zspage *zspage;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zspage = get_zspage(page);
+ get_zspage_mapping(zspage, &class_idx, &fg);
+ mapping = page_mapping(page);
+ pool = mapping->private_data;
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ dec_zspage_isolation(zspage);
+ if (!is_zspage_isolated(zspage)) {
+ fg = putback_zspage(class, zspage);
+ /*
+ * Because of the page_lock we cannot free the zspage
+ * immediately, so defer it.
+ */
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+ }
+ spin_unlock(&class->lock);
}
-static struct page *isolate_source_page(struct size_class *class)
+const struct address_space_operations zsmalloc_aops = {
+ .isolate_page = zs_page_isolate,
+ .migratepage = zs_page_migrate,
+ .putback_page = zs_page_putback,
+};
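
Condensed wiring for how a zspage subpage ends up dispatching into these callbacks; all three steps appear elsewhere in this patch:

    /* 1. zs_register_migration(): back the pool with an anon inode */
    pool->inode->i_mapping->a_ops = &zsmalloc_aops;
    /* 2. SetZsPageMovable(): tag every subpage against that mapping */
    __SetPageMovable(page, pool->inode->i_mapping);
    /* 3. the migration core then reaches ->isolate_page, ->migratepage
     *    and ->putback_page through page_mapping(page) */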
+
+static int zs_register_migration(struct zs_pool *pool)
+{
+ pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &zsmalloc_aops;
+ return 0;
+}
+
+static void zs_unregister_migration(struct zs_pool *pool)
+{
+ flush_work(&pool->free_work);
+ if (pool->inode)
+ iput(pool->inode);
+}
+
+/*
+ * The caller should hold the page_lock of all pages in the zspage.
+ * We cannot use zspage metadata here.
+ */
+static void async_free_zspage(struct work_struct *work)
{
int i;
- struct page *page = NULL;
+ struct size_class *class;
+ unsigned int class_idx;
+ enum fullness_group fullness;
+ struct zspage *zspage, *tmp;
+ LIST_HEAD(free_pages);
+ struct zs_pool *pool = container_of(work, struct zs_pool,
+ free_work);
- for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
- page = class->fullness_list[i];
- if (!page)
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+ if (class->index != i)
continue;
- remove_zspage(page, class, i);
- break;
+ spin_lock(&class->lock);
+ list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+ spin_unlock(&class->lock);
}
- return page;
+
+ list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
+ list_del(&zspage->list);
+ lock_zspage(zspage);
+
+ get_zspage_mapping(zspage, &class_idx, &fullness);
+ VM_BUG_ON(fullness != ZS_EMPTY);
+ class = pool->size_class[class_idx];
+ spin_lock(&class->lock);
+ __free_zspage(pool, pool->size_class[class_idx], zspage);
+ spin_unlock(&class->lock);
+ }
+}
+
+static void kick_deferred_free(struct zs_pool *pool)
+{
+ schedule_work(&pool->free_work);
}
+static void init_deferred_free(struct zs_pool *pool)
+{
+ INIT_WORK(&pool->free_work, async_free_zspage);
+}
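
The deferred-free plumbing in one view (condensed from this patch, not additional code):

    /* at pool creation */
    init_deferred_free(pool);   /* INIT_WORK(&pool->free_work, async_free_zspage) */

    /* when a zspage empties while its pages may still be page_locked */
    kick_deferred_free(pool);   /* schedule_work(&pool->free_work) */

    /* worker: async_free_zspage() splices the ZS_EMPTY lists of all
     * classes and __free_zspage()s each zspage under its class lock */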
+
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
+{
+ struct page *page = get_first_page(zspage);
+
+ do {
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } while ((page = get_next_page(page)) != NULL);
+}
+#endif
+
/*
* Based on the number of unused allocated objects calculate
@@ -1748,22 +2287,20 @@ static unsigned long zs_can_compact(struct size_class *class)
static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
struct zs_compact_control cc;
- struct page *src_page;
- struct page *dst_page = NULL;
+ struct zspage *src_zspage;
+ struct zspage *dst_zspage = NULL;
spin_lock(&class->lock);
- while ((src_page = isolate_source_page(class))) {
-
- BUG_ON(!is_first_page(src_page));
+ while ((src_zspage = isolate_zspage(class, true))) {
if (!zs_can_compact(class))
break;
cc.index = 0;
- cc.s_page = src_page;
+ cc.s_page = get_first_page(src_zspage);
- while ((dst_page = isolate_target_page(class))) {
- cc.d_page = dst_page;
+ while ((dst_zspage = isolate_zspage(class, false))) {
+ cc.d_page = get_first_page(dst_zspage);
/*
* If there is no more space in dst_page, resched
* and see if anyone has allocated another zspage.
@@ -1771,23 +2308,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
if (!migrate_zspage(pool, class, &cc))
break;
- putback_zspage(pool, class, dst_page);
+ putback_zspage(class, dst_zspage);
}
/* Stop if we couldn't find slot */
- if (dst_page == NULL)
+ if (dst_zspage == NULL)
break;
- putback_zspage(pool, class, dst_page);
- if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ putback_zspage(class, dst_zspage);
+ if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
+ free_zspage(pool, class, src_zspage);
pool->stats.pages_compacted += class->pages_per_zspage;
+ }
spin_unlock(&class->lock);
cond_resched();
spin_lock(&class->lock);
}
- if (src_page)
- putback_zspage(pool, class, src_page);
+ if (src_zspage)
+ putback_zspage(class, src_zspage);
spin_unlock(&class->lock);
}
@@ -1884,7 +2423,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name)
{
int i;
struct zs_pool *pool;
@@ -1894,6 +2433,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
if (!pool)
return NULL;
+ init_deferred_free(pool);
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
@@ -1905,7 +2445,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
if (!pool->name)
goto err;
- if (create_handle_cache(pool))
+ if (create_cache(pool))
goto err;
/*
@@ -1915,12 +2455,36 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
for (i = zs_size_classes - 1; i >= 0; i--) {
int size;
int pages_per_zspage;
+ int objs_per_zspage;
struct size_class *class;
+ int fullness = 0;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
size = ZS_MAX_ALLOC_SIZE;
pages_per_zspage = get_pages_per_zspage(size);
+ objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
+
+ /*
+ * We iterate from biggest down to smallest classes,
+ * so huge_class_size holds the size of the first huge
+ * class. Any object bigger than or equal to that will
+ * end up in the huge class.
+ */
+ if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
+ !huge_class_size) {
+ huge_class_size = size;
+ /*
+ * The object uses ZS_HANDLE_SIZE bytes to store the
+ * handle. We need to subtract it, because zs_malloc()
+ * unconditionally adds handle size before it performs
+ * size class search - so an object may be smaller than the
+ * huge class size, yet still end up in the huge class,
+ * because it grows by ZS_HANDLE_SIZE extra bytes right
+ * before the class lookup.
+ */
+ huge_class_size -= (ZS_HANDLE_SIZE - 1);
+ }
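
A worked example of the geometry above, assuming PAGE_SIZE == 4096 (illustrative numbers, not from the diff):

    /* size = 3072: get_pages_per_zspage() picks 3 pages, because
     * 3 * 4096 = 12288 bytes hold exactly 12288 / 3072 = 4 objects
     * with no waste, so objs_per_zspage = 4.
     * Both pages_per_zspage and objs_per_zspage differ from 1, so a
     * class like this is eligible to set huge_class_size; the first
     * such class reached while iterating downward wins.
     */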
/*
* size_class is used for normal zsmalloc operation such
@@ -1945,18 +2509,21 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
- if (pages_per_zspage == 1 &&
- get_maxobj_per_zspage(size, pages_per_zspage) == 1)
- class->huge = true;
+ class->objs_per_zspage = class->pages_per_zspage *
+ PAGE_SIZE / class->size;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
+ for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
+ fullness++)
+ INIT_LIST_HEAD(&class->fullness_list[fullness]);
prev_class = class;
}
- pool->flags = flags;
+ if (zs_pool_stat_create(pool, name))
+ goto err;
- if (zs_pool_stat_create(name, pool))
+ if (zs_register_migration(pool))
goto err;
/*
@@ -1978,6 +2545,7 @@ void zs_destroy_pool(struct zs_pool *pool)
int i;
zs_unregister_shrinker(pool);
+ zs_unregister_migration(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
@@ -1990,8 +2558,8 @@ void zs_destroy_pool(struct zs_pool *pool)
if (class->index != i)
continue;
- for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
- if (class->fullness_list[fg]) {
+ for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
+ if (!list_empty(&class->fullness_list[fg])) {
pr_info("Freeing non-empty class with size %db, fullness group %d\n",
class->size, fg);
}
@@ -1999,7 +2567,7 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
- destroy_handle_cache(pool);
+ destroy_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
@@ -2008,7 +2576,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
static int __init zs_init(void)
{
- int ret = zs_register_cpu_notifier();
+ int ret;
+
+ ret = zsmalloc_mount();
+ if (ret)
+ goto out;
+
+ ret = zs_register_cpu_notifier();
if (ret)
goto notifier_fail;
@@ -2032,7 +2606,8 @@ stat_fail:
#endif
notifier_fail:
zs_unregister_cpu_notifier();
-
+ zsmalloc_unmount();
+out:
return ret;
}
@@ -2041,6 +2616,7 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
+ zsmalloc_unmount();
zs_unregister_cpu_notifier();
zs_stat_exit();