summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c74
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c73
-rw-r--r--mm/highmem.c66
-rw-r--r--mm/hugetlb.c238
-rw-r--r--mm/internal.h2
-rw-r--r--mm/maccess.c2
-rw-r--r--mm/memcontrol.c422
-rw-r--r--mm/memory-failure.c176
-rw-r--r--mm/memory.c35
-rw-r--r--mm/memory_hotplug.c48
-rw-r--r--mm/mempolicy.c17
-rw-r--r--mm/migrate.c249
-rw-r--r--mm/mmap.c2
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nommu.c51
-rw-r--r--mm/oom_kill.c33
-rw-r--r--mm/page-writeback.c31
-rw-r--r--mm/page_alloc.c99
-rw-r--r--mm/page_isolation.c3
-rw-r--r--mm/rmap.c37
-rw-r--r--mm/shmem.c17
-rw-r--r--mm/slab.c2
-rw-r--r--mm/swap.c1
-rw-r--r--mm/swapfile.c49
-rw-r--r--mm/vmalloc.c56
-rw-r--r--mm/vmscan.c218
-rw-r--r--mm/vmstat.c44
29 files changed, 1475 insertions, 578 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
spin_lock(&inode_lock);
- list_for_each_entry(inode, &wb->b_dirty, i_list)
+ list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
+ list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
+ list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
spin_unlock(&inode_lock);
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
{
struct bdi_writeback *me = ptr;
- current->flags |= PF_FLUSHER | PF_SWAPWRITE;
+ current->flags |= PF_SWAPWRITE;
set_freezable();
/*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
+static atomic_t nr_bdi_congested[2];
void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
wait_queue_head_t *wqh = &congestion_wqh[sync];
bit = sync ? BDI_sync_congested : BDI_async_congested;
- clear_bit(bit, &bdi->state);
+ if (test_and_clear_bit(bit, &bdi->state))
+ atomic_dec(&nr_bdi_congested[sync]);
smp_mb__after_clear_bit();
if (waitqueue_active(wqh))
wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
enum bdi_state bit;
bit = sync ? BDI_sync_congested : BDI_async_congested;
- set_bit(bit, &bdi->state);
+ if (!test_and_set_bit(bit, &bdi->state))
+ atomic_inc(&nr_bdi_congested[sync]);
}
EXPORT_SYMBOL(set_bdi_congested);
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
long congestion_wait(int sync, long timeout)
{
long ret;
+ unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
+
+ trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
return ret;
}
EXPORT_SYMBOL(congestion_wait);
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absense of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+ long ret;
+ unsigned long start = jiffies;
+ DEFINE_WAIT(wait);
+ wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+ /*
+ * If there is no congestion, or heavy congestion is not being
+ * encountered in the current zone, yield if necessary instead
+ * of sleeping on the congestion queue
+ */
+ if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ !zone_is_reclaim_congested(zone)) {
+ cond_resched();
+
+ /* In case we scheduled, work out time remaining */
+ ret = timeout - (jiffies - start);
+ if (ret < 0)
+ ret = 0;
+
+ goto out;
+ }
+
+ /* Sleep until uncongested or a write happens */
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = io_schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+out:
+ trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+ jiffies_to_usecs(jiffies - start));
+
+ return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..4df2de77e069 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
+ might_sleep_if(mem_flags & __GFP_WAIT);
+
spin_lock_irqsave(&pool->lock, flags);
restart:
list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..ea89840fc65f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
TASK_UNINTERRUPTIBLE);
}
+int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+ unsigned int flags)
+{
+ if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
+ __lock_page(page);
+ return 1;
+ } else {
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ return 0;
+ }
+}
+
/**
* find_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -631,7 +644,9 @@ repeat:
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
- if (unlikely(!page || page == RADIX_TREE_RETRY))
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_deref_retry(page))
goto repeat;
if (!page_cache_get_speculative(page))
@@ -647,6 +662,7 @@ repeat:
goto repeat;
}
}
+out:
rcu_read_unlock();
return page;
@@ -764,12 +780,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page)) {
+ if (ret)
+ start = pages[ret-1]->index;
goto restart;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -817,11 +832,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (page->mapping == NULL || page->index != index)
@@ -874,11 +885,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (!page_cache_get_speculative(page))
@@ -1016,6 +1023,9 @@ find_page:
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
@@ -1539,25 +1549,30 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- lock_page(page);
-
- /* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto no_cached_page;
- }
} else {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
- page = find_lock_page(mapping, offset);
+ page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return ret | VM_FAULT_RETRY;
+ }
+
+ /* Did it get truncated? */
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ put_page(page);
+ goto retry_find;
+ }
+ VM_BUG_ON(page->index != offset);
+
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2192,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
if (written > 0) {
- loff_t end = pos + written;
- if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
- i_size_write(inode, end);
+ pos += written;
+ if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
+ i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = end;
+ *ppos = pos;
}
out:
return written;
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
#include <linux/kgdb.h>
#include <asm/tlbflush.h>
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
unsigned long totalhigh_pages __read_mostly;
EXPORT_SYMBOL(totalhigh_pages);
+
+EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
+
unsigned int nr_free_highpages (void)
{
pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-
-void debug_kmap_atomic(enum km_type type)
-{
- static int warn_count = 10;
-
- if (unlikely(warn_count < 0))
- return;
-
- if (unlikely(in_interrupt())) {
- if (in_nmi()) {
- if (type != KM_NMI && type != KM_NMI_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (in_irq()) {
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
- type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (!irqs_disabled()) { /* softirq */
- if (type != KM_IRQ0 && type != KM_IRQ1 &&
- type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
- type != KM_SKB_SUNRPC_DATA &&
- type != KM_SKB_DATA_SOFTIRQ &&
- type != KM_BOUNCE_READ) {
- WARN_ON(1);
- warn_count--;
- }
- }
- }
-
- if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
- type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
- type == KM_IRQ_PTE || type == KM_NMI ||
- type == KM_NMI_PTE ) {
- if (!irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
- if (irq_count() == 0 && !irqs_disabled()) {
- WARN_ON(1);
- warn_count--;
- }
- }
-#ifdef CONFIG_KGDB_KDB
- if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
- WARN_ON(1);
- warn_count--;
- }
-#endif /* CONFIG_KGDB_KDB */
-}
-
-#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..c4a3558589ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
}
}
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);
struct page *dst_base = dst;
struct page *src_base = src;
- might_sleep();
+
for (i = 0; i < pages_per_huge_page(h); ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
src = mem_map_next(src, src_base, i);
}
}
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma)
{
int i;
struct hstate *h = hstate_vma(vma);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_gigantic_page(dst, src, addr, vma);
+ copy_user_gigantic_page(dst, src, addr, vma);
return;
}
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
}
}
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++;
}
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ if (list_empty(&h->hugepage_freelists[nid]))
+ return NULL;
+ page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+ list_del(&page->lru);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve)
{
- int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
- nid = zone_to_nid(zone);
- if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
- !list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
-
- if (!avoid_reserve)
- decrement_hugepage_resv_vma(h, vma);
-
- break;
+ if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+ page = dequeue_huge_page_node(h, zone_to_nid(zone));
+ if (page) {
+ if (!avoid_reserve)
+ decrement_hugepage_resv_vma(h, vma);
+ break;
+ }
}
}
err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret;
}
-static struct page *alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
struct page *page;
- unsigned int nid;
+ unsigned int r_nid;
if (h->order >= MAX_ORDER)
return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
- __GFP_REPEAT|__GFP_NOWARN,
- huge_page_order(h));
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+ __GFP_REPEAT|__GFP_NOWARN,
+ huge_page_order(h));
+ else
+ page = alloc_pages_exact_node(nid,
+ htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+ __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
spin_lock(&hugetlb_lock);
if (page) {
- /*
- * This page is now managed by the hugetlb allocator and has
- * no users -- drop the buddy allocator's reference.
- */
- put_page_testzero(page);
- VM_BUG_ON(page_count(page));
- nid = page_to_nid(page);
+ r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
/*
* We incremented the global counters already
*/
- h->nr_huge_pages_node[nid]++;
- h->surplus_huge_pages_node[nid]++;
+ h->nr_huge_pages_node[r_nid]++;
+ h->surplus_huge_pages_node[r_nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC);
} else {
h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
}
/*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ spin_lock(&hugetlb_lock);
+ page = dequeue_huge_page_node(h, nid);
+ spin_unlock(&hugetlb_lock);
+
+ if (!page)
+ page = alloc_buddy_huge_page(h, nid);
+
+ return page;
+}
+
+/*
* Increase the hugetlb pool such that it can accomodate a reservation
* of size 'delta'.
*/
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_buddy_huge_page(h, NULL, 0);
- if (!page) {
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+ if (!page)
/*
* We were not able to allocate enough pages to
* satisfy the entire reservation so we free what
* we've allocated so far.
*/
- spin_lock(&hugetlb_lock);
- needed = 0;
goto free;
- }
list_add(&page->lru, &surplus_list);
}
@@ -908,31 +964,31 @@ retry:
needed += allocated;
h->resv_huge_pages += delta;
ret = 0;
-free:
+
+ spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
list_del(&page->lru);
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the buddy allocator's reference.
+ */
+ put_page_testzero(page);
+ VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page);
}
/* Free unnecessary surplus pages to the buddy allocator */
+free:
if (!list_empty(&surplus_list)) {
- spin_unlock(&hugetlb_lock);
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru);
- /*
- * The page has a reference count of zero already, so
- * call free_huge_page directly instead of using
- * put_page. This must be done with hugetlb_lock
- * unlocked which is safe because free_huge_page takes
- * hugetlb_lock before deciding how to free the page.
- */
- free_huge_page(page);
+ put_page(page);
}
- spin_lock(&hugetlb_lock);
}
+ spin_lock(&hugetlb_lock);
return ret;
}
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
if (!page) {
- page = alloc_buddy_huge_page(h, vma, addr);
+ page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS);
}
}
- set_page_refcounted(page);
set_page_private(page, (unsigned long) mapping);
vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
static int is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -2380,10 +2448,13 @@ retry_avoidcopy:
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
*/
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
+ }
- copy_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
/*
@@ -2515,22 +2586,20 @@ retry:
hugepage_add_new_anon_rmap(page, vma, address);
}
} else {
+ /*
+ * If memory error occurs between mmap() and fault, some process
+ * don't have hwpoisoned swap entry for errored virtual address.
+ * So we need to block hugepage fault by PG_hwpoison bit check.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON |
+ VM_FAULT_SET_HINDEX(h - hstates);
+ goto backout_unlocked;
+ }
page_dup_rmap(page);
}
/*
- * Since memory error handler replaces pte into hwpoison swap entry
- * at the time of error handling, a process which reserved but not have
- * the mapping to the error hugepage does not have hwpoison swap entry.
- * So we need to block accesses from such a process by checking
- * PG_hwpoison bit here.
- */
- if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON;
- goto backout_unlocked;
- }
-
- /*
* If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that
* any allocations necessary to record that reservation occur outside
@@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (ptep) {
entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON;
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ migration_entry_wait(mm, (pmd_t *)ptep, address);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(h - hstates);
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed));
}
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+ struct page *page;
+ struct page *tmp;
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+ if (page == hpage)
+ return 1;
+ return 0;
+}
+
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
*/
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
{
struct hstate *h = page_hstate(hpage);
int nid = page_to_nid(hpage);
+ int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- list_del(&hpage->lru);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
+ if (is_hugepage_on_freelist(hpage)) {
+ list_del(&hpage->lru);
+ set_page_refcounted(hpage);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ ret = 0;
+ }
spin_unlock(&hugetlb_lock);
+ return ret;
}
+#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
*/
static inline unsigned long page_order(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ /* PageBuddy() must be checked by the caller */
return page_private(page);
}
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
/*
* Access kernel memory without faulting.
*/
-#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/uaccess.h>
/**
* probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..2efa8ea07ff7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
- MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */
+ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+ /* incremented at every pagein/pageout */
+ MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+ MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
MEM_CGROUP_STAT_NSTATS,
};
@@ -254,6 +257,12 @@ struct mem_cgroup {
* percpu counter.
*/
struct mem_cgroup_stat_cpu *stat;
+ /*
+ * used when a cpu is offlined or other synchronizations
+ * See mem_cgroup_read_stat().
+ */
+ struct mem_cgroup_stat_cpu nocpu_base;
+ spinlock_t pcp_counter_lock;
};
/* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
return mz;
}
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both of vmstat[] and percpu_counter has threshold and do periodic
+ * synchronization to implement "quick" read. There are trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+ * a periodic synchronizion of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+ * common workload, threashold and synchonization as vmstat[] should be
+ * implemented.
+ */
static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
enum mem_cgroup_stat_index idx)
{
int cpu;
s64 val = 0;
- for_each_possible_cpu(cpu)
+ get_online_cpus();
+ for_each_online_cpu(cpu)
val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.count[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ put_online_cpus();
return val;
}
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
return mem;
}
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
- int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
{
- int found, ret, nextid;
struct cgroup_subsys_state *css;
- struct mem_cgroup *mem;
-
- if (!root->use_hierarchy)
- return (*func)(root, data);
+ int found;
- nextid = 1;
- do {
- ret = 0;
+ if (!mem) /* ROOT cgroup has the smallest ID */
+ return root_mem_cgroup; /*css_put/get against root is ignored*/
+ if (!mem->use_hierarchy) {
+ if (css_tryget(&mem->css))
+ return mem;
+ return NULL;
+ }
+ rcu_read_lock();
+ /*
+ * searching a memory cgroup which has the smallest ID under given
+ * ROOT cgroup. (ID >= 1)
+ */
+ css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+ if (css && css_tryget(css))
+ mem = container_of(css, struct mem_cgroup, css);
+ else
mem = NULL;
+ rcu_read_unlock();
+ return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+ struct mem_cgroup *root,
+ bool cond)
+{
+ int nextid = css_id(&iter->css) + 1;
+ int found;
+ int hierarchy_used;
+ struct cgroup_subsys_state *css;
+
+ hierarchy_used = iter->use_hierarchy;
+
+ css_put(&iter->css);
+ /* If no ROOT, walk all, ignore hierarchy */
+ if (!cond || (root && !hierarchy_used))
+ return NULL;
+ if (!root)
+ root = root_mem_cgroup;
+
+ do {
+ iter = NULL;
rcu_read_lock();
- css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
- &found);
+
+ css = css_get_next(&mem_cgroup_subsys, nextid,
+ &root->css, &found);
if (css && css_tryget(css))
- mem = container_of(css, struct mem_cgroup, css);
+ iter = container_of(css, struct mem_cgroup, css);
rcu_read_unlock();
-
- if (mem) {
- ret = (*func)(mem, data);
- css_put(&mem->css);
- }
+ /* If css is NULL, no more cgroups will be found */
nextid = found + 1;
- } while (!ret && css);
+ } while (css && !iter);
- return ret;
+ return iter;
}
+/*
+ * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
+ * be careful that "break" loop is not allowed. We have reference count.
+ * Instead of that modify "cond" to be false and "continue" to exit the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
+ for (iter = mem_cgroup_start_loop(root);\
+ iter != NULL;\
+ iter = mem_cgroup_get_next(iter, root, cond))
+
+#define for_each_mem_cgroup_tree(iter, root) \
+ for_each_mem_cgroup_tree_cond(iter, root, true)
+
+#define for_each_mem_cgroup_all(iter) \
+ for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
{
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+
+ synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+ int cpu;
+
+ if (!mem)
+ return;
+ get_online_cpus();
+ spin_lock(&mem->pcp_counter_lock);
+ for_each_online_cpu(cpu)
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+ mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+ spin_unlock(&mem->pcp_counter_lock);
+ put_online_cpus();
+}
+/*
+ * 2 routines for checking "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
+ * for avoiding race in accounting. If true,
+ * pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
+ * under hierarchy of moving cgroups. This is for
+ * waiting at hith-memory prressure caused by "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+ return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
return false;
}
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
-{
- int *val = data;
- (*val)++;
- return 0;
-}
-
/**
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
static int mem_cgroup_count_children(struct mem_cgroup *mem)
{
int num = 0;
- mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ num++;
return num;
}
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return total;
}
-static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
-{
- int *val = (int *)data;
- int x;
- /*
- * Logically, we can stop scanning immediately when we find
- * a memcg is already locked. But condidering unlock ops and
- * creation/removal of memcg, scan-all is simple operation.
- */
- x = atomic_inc_return(&mem->oom_lock);
- *val = max(x, *val);
- return 0;
-}
/*
* Check OOM-Killer is already running under our hierarchy.
* If someone is running, return false.
*/
static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
{
- int lock_count = 0;
+ int x, lock_count = 0;
+ struct mem_cgroup *iter;
- mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+ for_each_mem_cgroup_tree(iter, mem) {
+ x = atomic_inc_return(&iter->oom_lock);
+ lock_count = max(x, lock_count);
+ }
if (lock_count == 1)
return true;
return false;
}
-static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
+ struct mem_cgroup *iter;
+
/*
* When a new child is created while the hierarchy is under oom,
* mem_cgroup_oom_lock() may not be called. We have to use
* atomic_add_unless() here.
*/
- atomic_add_unless(&mem->oom_lock, -1, 0);
+ for_each_mem_cgroup_tree(iter, mem)
+ atomic_add_unless(&iter->oom_lock, -1, 0);
return 0;
}
-static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
-{
- mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
-}
static DEFINE_MUTEX(memcg_oom_mutex);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
/*
* Currently used to update mapped file statistics, but the routine can be
* generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup member but
+ * it tends to be costly. But considering some conditions, we doesn't need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to radix-tree. There
+ * are no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
+ * if there are race with "uncharge". Statistics itself is properly handled
+ * by flags.
+ *
+ * Considering "move", this is an only case we see a race. To make the race
+ * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
+ * possibility of race condition. If there is, we take a lock.
*/
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
{
struct mem_cgroup *mem;
- struct page_cgroup *pc;
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ bool need_unlock = false;
- pc = lookup_page_cgroup(page);
if (unlikely(!pc))
return;
- lock_page_cgroup(pc);
+ rcu_read_lock();
mem = pc->mem_cgroup;
- if (!mem || !PageCgroupUsed(pc))
- goto done;
+ if (unlikely(!mem || !PageCgroupUsed(pc)))
+ goto out;
+ /* pc->mem_cgroup is unstable ? */
+ if (unlikely(mem_cgroup_stealed(mem))) {
+ /* take a lock against to access pc->mem_cgroup */
+ lock_page_cgroup(pc);
+ need_unlock = true;
+ mem = pc->mem_cgroup;
+ if (!mem || !PageCgroupUsed(pc))
+ goto out;
+ }
- /*
- * Preemption is already disabled. We can use __this_cpu_xxx
- */
- if (val > 0) {
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- SetPageCgroupFileMapped(pc);
- } else {
- __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
- ClearPageCgroupFileMapped(pc);
+ this_cpu_add(mem->stat->count[idx], val);
+
+ switch (idx) {
+ case MEM_CGROUP_STAT_FILE_MAPPED:
+ if (val > 0)
+ SetPageCgroupFileMapped(pc);
+ else if (!page_mapped(page))
+ ClearPageCgroupFileMapped(pc);
+ break;
+ default:
+ BUG();
}
-done:
- unlock_page_cgroup(pc);
+out:
+ if (unlikely(need_unlock))
+ unlock_page_cgroup(pc);
+ rcu_read_unlock();
+ return;
+}
+
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+ mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
}
/*
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
atomic_dec(&memcg_drain_count);
}
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains percpu counter value from DEAD cpu and
+ * move it to local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+ int i;
+
+ spin_lock(&mem->pcp_counter_lock);
+ for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+ s64 x = per_cpu(mem->stat->count[i], cpu);
+
+ per_cpu(mem->stat->count[i], cpu) = 0;
+ mem->nocpu_base.count[i] += x;
+ }
+ /* need to clear ON_MOVE value, works as a kind of lock. */
+ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+ int idx = MEM_CGROUP_ON_MOVE;
+
+ spin_lock(&mem->pcp_counter_lock);
+ per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
+ struct mem_cgroup *iter;
- if (action != CPU_DEAD)
+ if ((action == CPU_ONLINE)) {
+ for_each_mem_cgroup_all(iter)
+ synchronize_mem_cgroup_on_move(iter, cpu);
return NOTIFY_OK;
+ }
+
+ if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
+ return NOTIFY_OK;
+
+ for_each_mem_cgroup_all(iter)
+ mem_cgroup_drain_pcp_counter(iter, cpu);
+
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
@@ -3038,6 +3226,7 @@ move_account:
lru_add_drain_all();
drain_all_stock_sync();
ret = 0;
+ mem_cgroup_start_move(mem);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
if (ret)
break;
}
+ mem_cgroup_end_move(mem);
memcg_oom_recover(mem);
/* it seems parent cgroup doesn't have enough mem */
if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
return retval;
}
-struct mem_cgroup_idx_data {
- s64 val;
- enum mem_cgroup_stat_index idx;
-};
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
- struct mem_cgroup_idx_data *d = data;
- d->val += mem_cgroup_read_stat(mem, d->idx);
- return 0;
-}
+ struct mem_cgroup *iter;
+ s64 val = 0;
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx, s64 *val)
-{
- struct mem_cgroup_idx_data d;
- d.idx = idx;
- d.val = 0;
- mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
- *val = d.val;
+ /* each per cpu's value can be minus.Then, use s64 */
+ for_each_mem_cgroup_tree(iter, mem)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? */
+ val = 0;
+ return val;
}
static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
{
- u64 idx_val, val;
+ u64 val;
if (!mem_cgroup_is_root(mem)) {
if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
return res_counter_read_u64(&mem->memsw, RES_USAGE);
}
- mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
- val = idx_val;
- mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
- val += idx_val;
+ val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
- if (swap) {
- mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT, &idx_val);
- val += idx_val;
- }
+ if (swap)
+ val += mem_cgroup_get_recursive_idx_stat(mem,
+ MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
@@ -3389,9 +3567,9 @@ struct {
};
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- struct mcs_total_stat *s = data;
s64 val;
/* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
- return 0;
}
static void
mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
{
- mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_get_local_stat(iter, s);
}
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
return _a->threshold - _b->threshold;
}
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
{
struct mem_cgroup_eventfd_list *ev;
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
{
- mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, mem)
+ mem_cgroup_oom_notify_cb(iter);
}
static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4025,14 +4208,17 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!mem->stat) {
- if (size < PAGE_SIZE)
- kfree(mem);
- else
- vfree(mem);
- mem = NULL;
- }
+ if (!mem->stat)
+ goto out_free;
+ spin_lock_init(&mem->pcp_counter_lock);
return mem;
+
+out_free:
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ return NULL;
}
/*
@@ -4158,7 +4344,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
&per_cpu(memcg_stock, cpu);
INIT_WORK(&stock->work, drain_local_stock);
}
- hotcpu_notifier(memcg_stock_cpu_callback, 0);
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4699,7 @@ static void mem_cgroup_clear_mc(void)
mc.to = NULL;
mc.moving_task = NULL;
spin_unlock(&mc.lock);
+ mem_cgroup_end_move(from);
memcg_oom_recover(from);
memcg_oom_recover(to);
wake_up_all(&mc.waitq);
@@ -4543,6 +4730,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..124324134ff6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
*/
/*
@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* Issues:
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
+ int res = 0;
struct page *hpage = compound_head(p);
/*
* We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* so there is no race between isolation and mapping/unmapping.
*/
if (!(page_mapping(hpage) || PageAnon(hpage))) {
- __isolate_hwpoisoned_huge_page(hpage);
- return RECOVERED;
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
}
return DELAYED;
}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
struct page *hpage = compound_head(p);
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill);
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(hpage, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
-
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
- if (PageHuge(p))
- clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+ if (PageHuge(p)) {
+ pr_info("get_any_page: %#lx free huge page\n", pfn);
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
+ pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+ pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags);
ret = -EIO;
}
@@ -1235,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return ret;
}
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
+ if (PageHWPoison(hpage)) {
+ put_page(hpage);
+ pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+
+ list_add(&hpage->lru, &pagelist);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ if (ret) {
+ putback_lru_pages(&pagelist);
+ pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0)
+ ret = -EIO;
+ return ret;
+ }
+done:
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ /* keep elevated page count for bad page */
+ return ret;
+}
+
/**
* soft_offline_page - Soft offline a page.
* @page: page to offline
@@ -1262,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+
ret = get_any_page(page, pfn, flags);
if (ret < 0)
return ret;
@@ -1288,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
goto done;
}
if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
@@ -1302,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1323,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
put_page(page);
if (ret == 1) {
ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
}
@@ -1339,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
if (ret)
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedef..02e48aa0ed13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -736,7 +736,7 @@ again:
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte)
return -ENOMEM;
- src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
orig_src_pte = src_pte;
@@ -767,7 +767,7 @@ again:
arch_leave_lazy_mmu_mode();
spin_unlock(src_ptl);
- pte_unmap_nested(orig_src_pte);
+ pte_unmap(orig_src_pte);
add_mm_rss_vec(dst_mm, rss);
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+ (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+ VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
BUG();
}
@@ -1590,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
-pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
pgd_t * pgd = pgd_offset(mm, addr);
@@ -2079,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
- memset(kaddr, 0, PAGE_SIZE);
+ clear_page(kaddr);
kunmap_atomic(kaddr, KM_USER0);
flush_dcache_page(dst);
} else
@@ -2107,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
{
struct page *old_page, *new_page;
pte_t entry;
@@ -2626,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
+ int locked;
struct mem_cgroup *ptr = NULL;
int exclusive = 0;
int ret = 0;
@@ -2676,8 +2679,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_release;
}
- lock_page(page);
+ locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ if (!locked) {
+ ret |= VM_FAULT_RETRY;
+ goto out_release;
+ }
/*
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2926,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+ VM_FAULT_RETRY)))
return ret;
if (unlikely(PageHWPoison(vmf.page))) {
@@ -3343,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr)
#endif /* __HAVE_ARCH_GATE_AREA */
-static int follow_pte(struct mm_struct *mm, unsigned long address,
+static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -3380,6 +3388,17 @@ out:
return -EINVAL;
}
+static inline int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
+{
+ int res;
+
+ /* (void) is needed to make gcc happy */
+ (void) __cond_lock(*ptlp,
+ !(res = __follow_pte(mm, address, ptepp, ptlp)));
+ return res;
+}
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4e940a26945..9260314a221e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page)
/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- int type;
struct page *page = pfn_to_page(start_pfn);
struct page *end_page = page + nr_pages;
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
- type = get_pageblock_migratetype(page);
-
- /*
- * A pageblock containing MOVABLE or free pages is considered
- * removable
- */
- if (type != MIGRATE_MOVABLE && !pageblock_free(page))
- return 0;
-
- /*
- * A pageblock starting with a PageReserved page is not
- * considered removable.
- */
- if (PageReserved(page))
+ if (!is_pageblock_removable_nolock(page))
return 0;
+ cond_resched();
}
/* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
* Scanning pfn is much easier than scanning lru list.
* Scan pfn from start to end and Find LRU page.
*/
-int scan_lru_pages(unsigned long start, unsigned long end)
+static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
struct page *page;
@@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page_is_file_cache(page));
} else {
- /* Becasue we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page))
- not_managed++;
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
dump_page(page);
#endif
+ /* Becasue we don't have big zone->lock. we should
+ check this again here. */
+ if (page_count(page)) {
+ not_managed++;
+ ret = -EBUSY;
+ break;
+ }
}
}
- ret = -EBUSY;
- if (not_managed) {
- if (!list_empty(&source))
+ if (!list_empty(&source)) {
+ if (not_managed) {
+ putback_lru_pages(&source);
+ goto out;
+ }
+ /* this function returns # of failed pages */
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+ if (ret)
putback_lru_pages(&source);
- goto out;
}
- ret = 0;
- if (list_empty(&source))
- goto out;
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
-
out:
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..4a57f135b76e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest, 0);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
return err;
}
@@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len,
err = mbind_range(mm, start, end, new);
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma, 0);
+ if (nr_failed)
+ putback_lru_pages(&pagelist);
+ }
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
@@ -1588,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy)
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone->node;
+ return zone ? zone->node : numa_node_id();
}
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..fe5a3c6a5426 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
#include <linux/gfp.h>
#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte;
spinlock_t *ptl;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
+ if (unlikely(PageHuge(new))) {
+ ptep = huge_pte_offset(mm, addr);
+ if (!ptep)
+ goto out;
+ ptl = &mm->page_table_lock;
+ } else {
+ pgd = pgd_offset(mm, addr);
+ if (!pgd_present(*pgd))
+ goto out;
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
+ pud = pud_offset(pgd, addr);
+ if (!pud_present(*pud))
+ goto out;
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- goto out;
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ goto out;
- ptep = pte_offset_map(pmd, addr);
+ ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ if (!is_swap_pte(*ptep)) {
+ pte_unmap(ptep);
+ goto out;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+ }
- ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(new))
+ pte = pte_mkhuge(pte);
+#endif
flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
- if (PageAnon(new))
+ if (PageHuge(new)) {
+ if (PageAnon(new))
+ hugepage_add_anon_rmap(new, vma, addr);
+ else
+ page_dup_rmap(new);
+ } else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
/*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page)
+{
+ int expected_count;
+ void **pslot;
+
+ if (!mapping) {
+ if (page_count(page) != 1)
+ return -EAGAIN;
+ return 0;
+ }
+
+ spin_lock_irq(&mapping->tree_lock);
+
+ pslot = radix_tree_lookup_slot(&mapping->page_tree,
+ page_index(page));
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+ (struct page *)radix_tree_deref_slot(pslot) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ if (!page_freeze_refs(page, expected_count)) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+
+ get_page(newpage);
+
+ radix_tree_replace_slot(pslot, newpage);
+
+ page_unfreeze_refs(page, expected_count);
+
+ __put_page(page);
+
+ spin_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
* Copy the page to its new location
*/
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
{
- copy_highpage(newpage, page);
+ if (PageHuge(page))
+ copy_huge_page(newpage, page);
+ else
+ copy_highpage(newpage, page);
if (PageError(page))
SetPageError(newpage);
@@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
.nr_to_write = 1,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1
};
int rc;
@@ -724,6 +789,92 @@ move_newpage:
}
/*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+ unsigned long private, struct page *hpage,
+ int force, int offlining)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
+ int rcu_locked = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ if (!new_hpage)
+ return -ENOMEM;
+
+ rc = -EAGAIN;
+
+ if (!trylock_page(hpage)) {
+ if (!force)
+ goto out;
+ lock_page(hpage);
+ }
+
+ if (PageAnon(hpage)) {
+ rcu_read_lock();
+ rcu_locked = 1;
+
+ if (page_mapped(hpage)) {
+ anon_vma = page_anon_vma(hpage);
+ atomic_inc(&anon_vma->external_refcount);
+ }
+ }
+
+ try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+ if (!page_mapped(hpage))
+ rc = move_to_new_page(new_hpage, hpage, 1);
+
+ if (rc)
+ remove_migration_ptes(hpage, hpage);
+
+ if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+ &anon_vma->lock)) {
+ int empty = list_empty(&anon_vma->head);
+ spin_unlock(&anon_vma->lock);
+ if (empty)
+ anon_vma_free(anon_vma);
+ }
+
+ if (rcu_locked)
+ rcu_read_unlock();
+out:
+ unlock_page(hpage);
+
+ if (rc != -EAGAIN) {
+ list_del(&hpage->lru);
+ put_page(hpage);
+ }
+
+ put_page(new_hpage);
+
+ if (result) {
+ if (rc)
+ *result = rc;
+ else
+ *result = page_to_nid(new_hpage);
+ }
+ return rc;
+}
+
+/*
* migrate_pages
*
* The function takes one list of pages to migrate and a function
@@ -732,8 +883,9 @@ move_newpage:
*
* The function returns after 10 attempts or if no pages
* are movable anymore because to has become empty
- * or no retryable pages exist anymore. All pages will be
- * returned to the LRU or freed.
+ * or no retryable pages exist anymore.
+ * Caller should call putback_lru_pages to return pages to the LRU
+ * or free list.
*
* Return: Number of pages not migrated or error code.
*/
@@ -780,7 +932,51 @@ out:
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- putback_lru_pages(from);
+ if (rc)
+ return rc;
+
+ return nr_failed + retry;
+}
+
+int migrate_huge_pages(struct list_head *from,
+ new_page_t get_new_page, unsigned long private, int offlining)
+{
+ int retry = 1;
+ int nr_failed = 0;
+ int pass = 0;
+ struct page *page;
+ struct page *page2;
+ int rc;
+
+ for (pass = 0; pass < 10 && retry; pass++) {
+ retry = 0;
+
+ list_for_each_entry_safe(page, page2, from, lru) {
+ cond_resched();
+
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, page, pass > 2, offlining);
+
+ switch(rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ retry++;
+ break;
+ case 0:
+ break;
+ default:
+ /* Permanent failure */
+ nr_failed++;
+ break;
+ }
+ }
+ }
+ rc = 0;
+out:
+
+ list_for_each_entry_safe(page, page2, from, lru)
+ put_page(page);
if (rc)
return rc;
@@ -841,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
err = -EFAULT;
vma = find_vma(mm, pp->addr);
- if (!vma || !vma_migratable(vma))
+ if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
page = follow_page(vma, pp->addr, FOLL_GET);
@@ -890,9 +1086,12 @@ set_status:
}
err = 0;
- if (!list_empty(&pagelist))
+ if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
(unsigned long)pm, 0);
+ if (err)
+ putback_lru_pages(&pagelist);
+ }
up_read(&mm->mmap_sem);
return err;
@@ -1005,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
int err = -EFAULT;
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || addr < vma->vm_start)
goto set_status;
page = follow_page(vma, addr, 0);
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..b179abb1474a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/audit.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -1108,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
unsigned long retval = -EBADF;
if (!(flags & MAP_ANONYMOUS)) {
+ audit_mmap_fd(fd, flags);
if (unlikely(flags & MAP_HUGETLB))
return -EINVAL;
file = fget(fd);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..4c5133873097 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -211,6 +211,7 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ perf_event_mmap(vma);
return 0;
fail:
@@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
- perf_event_mmap(vma);
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..563fbdd6293a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
* pte locks because exclusive mmap_sem prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
- new_pte = pte_offset_map_nested(new_pmd, new_addr);
+ new_pte = pte_offset_map(new_pmd, new_addr);
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
arch_leave_lazy_mmu_mode();
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
- pte_unmap_nested(new_pte - 1);
+ pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..3613517c7592 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/audit.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -293,11 +294,58 @@ void *vmalloc(unsigned long size)
}
EXPORT_SYMBOL(vmalloc);
+/*
+ * vzalloc - allocate virtually continguos memory with zero fill
+ *
+ * @size: allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into continguos kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
-EXPORT_SYMBOL(vmalloc_node);
+
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return vzalloc(size);
+}
+EXPORT_SYMBOL(vzalloc_node);
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
@@ -1411,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
struct file *file = NULL;
unsigned long retval = -EBADF;
+ audit_mmap_fd(fd, flags);
if (!(flags & MAP_ANONYMOUS)) {
file = fget(fd);
if (!file)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
return 0;
/*
- * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
- * need to be executed for something that cannot be killed.
+ * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
+ * so the entire heuristic doesn't need to be executed for something
+ * that cannot be killed.
*/
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ if (atomic_read(&p->mm->oom_disable_count)) {
task_unlock(p);
return 0;
}
@@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
+ struct task_struct *q;
+ struct mm_struct *mm;
+
p = find_lock_task_mm(p);
if (!p)
return 1;
+ /* mm cannot be safely dereferenced after task_unlock(p) */
+ mm = p->mm;
+
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm, K(p->mm->total_vm),
K(get_mm_counter(p->mm, MM_ANONPAGES)),
K(get_mm_counter(p->mm, MM_FILEPAGES)));
task_unlock(p);
+ /*
+ * Kill all processes sharing p->mm in other thread groups, if any.
+ * They don't get access to memory reserves or a higher scheduler
+ * priority, though, to avoid depletion of all memory or task
+ * starvation. This prevents mm->mmap_sem livelock when an oom killed
+ * task cannot exit because it requires the semaphore and its contended
+ * by another thread trying to allocate memory itself. That thread will
+ * now get access to memory reserves since it has a pending fatal
+ * signal.
+ */
+ for_each_process(q)
+ if (q->mm == mm && !same_thread_group(q, p)) {
+ task_lock(q); /* Protect ->comm from prctl() */
+ pr_err("Kill process %d (%s) sharing same memory\n",
+ task_pid_nr(q), q->comm);
+ task_unlock(q);
+ force_sig(SIGKILL, q);
+ }
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
@@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
- (current->signal->oom_adj != OOM_DISABLE)) {
+ current->mm && !atomic_read(&current->mm->oom_disable_count)) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..b840afa89761 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
- else {
- int dirty_ratio;
-
- dirty_ratio = vm_dirty_ratio;
- if (dirty_ratio < 5)
- dirty_ratio = 5;
- dirty = (dirty_ratio * available_memory) / 100;
- }
+ else
+ dirty = (vm_dirty_ratio * available_memory) / 100;
if (dirty_background_bytes)
background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* catch-up. This avoids (excessively) small writeouts
* when the bdi limits are ramping up.
*/
- if (nr_reclaimable + nr_writeback <
+ if (nr_reclaimable + nr_writeback <=
(background_thresh + dirty_thresh) / 2)
break;
@@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping,
* the last resort safeguard.
*/
dirty_exceeded =
- (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
- || (nr_reclaimable + nr_writeback >= dirty_thresh);
+ (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
+ || (nr_reclaimable + nr_writeback > dirty_thresh);
if (!dirty_exceeded)
break;
@@ -1121,6 +1115,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
__inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_DIRTIED);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1124,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for set_page_writeback family.
+ * NOTE: Unlike account_page_dirtied this does not rely on being atomic
+ * wrt interrupts.
+ */
+void account_page_writeback(struct page *page)
+{
+ inc_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+}
+EXPORT_SYMBOL(account_page_writeback);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -1366,7 +1373,7 @@ int test_set_page_writeback(struct page *page)
ret = TestSetPageWriteback(page);
}
if (!ret)
- inc_zone_page_state(page, NR_WRITEBACK);
+ account_page_writeback(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2a362c52fdf4..07a654486f75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -531,7 +531,7 @@ static inline void __free_one_page(struct page *page,
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
combined_idx = __find_combined_index(page_idx, order);
higher_page = page + combined_idx - page_idx;
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
@@ -1932,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
const gfp_t wait = gfp_mask & __GFP_WAIT;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
- BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
@@ -1940,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
*/
- alloc_flags |= (gfp_mask & __GFP_HIGH);
+ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
alloc_flags |= ALLOC_HARDER;
@@ -2095,7 +2095,7 @@ rebalance:
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
}
@@ -5297,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* page allocater never alloc memory from ISOLATE block.
*/
+static int
+__count_immobile_pages(struct zone *zone, struct page *page, int count)
+{
+ unsigned long pfn, iter, found;
+ /*
+ * For avoiding noise data, lru_add_drain_all() should be called
+ * If ZONE_MOVABLE, the zone never contains immobile pages
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ return true;
+
+ if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
+ return true;
+
+ pfn = page_to_pfn(page);
+ for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ unsigned long check = pfn + iter;
+
+ if (!pfn_valid_within(check)) {
+ iter++;
+ continue;
+ }
+ page = pfn_to_page(check);
+ if (!page_count(page)) {
+ if (PageBuddy(page))
+ iter += (1 << page_order(page)) - 1;
+ continue;
+ }
+ if (!PageLRU(page))
+ found++;
+ /*
+ * If there are RECLAIMABLE pages, we need to check it.
+ * But now, memory offline itself doesn't call shrink_slab()
+ * and it still to be fixed.
+ */
+ /*
+ * If the page is not RAM, page_count()should be 0.
+ * we don't need more check. This is an _used_ not-movable page.
+ *
+ * The problematic thing here is PG_reserved pages. PG_reserved
+ * is set to both of a memory hole page and a _used_ kernel
+ * page at boot.
+ */
+ if (found > count)
+ return false;
+ }
+ return true;
+}
+
+bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ return __count_immobile_pages(zone, page, 0);
+}
+
int set_migratetype_isolate(struct page *page)
{
struct zone *zone;
- struct page *curr_page;
- unsigned long flags, pfn, iter;
- unsigned long immobile = 0;
+ unsigned long flags, pfn;
struct memory_isolate_notify arg;
int notifier_ret;
int ret = -EBUSY;
@@ -5312,11 +5365,6 @@ int set_migratetype_isolate(struct page *page)
zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
- zone_idx == ZONE_MOVABLE) {
- ret = 0;
- goto out;
- }
pfn = page_to_pfn(page);
arg.start_pfn = pfn;
@@ -5336,23 +5384,20 @@ int set_migratetype_isolate(struct page *page)
*/
notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret || !arg.pages_found)
+ if (notifier_ret)
goto out;
-
- for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
- if (!pfn_valid_within(pfn))
- continue;
-
- curr_page = pfn_to_page(iter);
- if (!page_count(curr_page) || PageLRU(curr_page))
- continue;
-
- immobile++;
- }
-
- if (arg.pages_found == immobile)
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (__count_immobile_pages(zone, page, arg.pages_found))
ret = 0;
+ /*
+ * immobile means "not-on-lru" paes. If immobile is larger than
+ * removable-by-driver pages reported by notifier, we'll fail.
+ */
+
out:
if (!ret) {
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 0 if all pages in the range is isolated.
+ * Returns 1 if all pages in the range is isolated.
*/
static int
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
struct zone *zone;
int ret;
- pfn = start_pfn;
/*
* Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
* is not aligned to pageblock_nr_pages.
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..1a8bf76bfd03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
}
-void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
+static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}
@@ -314,7 +314,7 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is
* tricky: page_lock_anon_vma rely on RCU to guard against the races.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *__page_lock_anon_vma(struct page *page)
{
struct anon_vma *anon_vma, *root_anon_vma;
unsigned long anon_mapping;
@@ -348,6 +348,8 @@ out:
}
void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ __releases(&anon_vma->root->lock)
+ __releases(RCU)
{
anon_vma_unlock(anon_vma);
rcu_read_unlock();
@@ -407,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
*
* On success returns with pte mapped and locked.
*/
-pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
pgd_t *pgd;
@@ -745,7 +747,7 @@ int page_mkclean(struct page *page)
if (mapping) {
ret = page_mkclean_file(mapping, page);
if (page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
ret = 1;
}
}
@@ -780,10 +782,10 @@ void page_move_anon_rmap(struct page *page,
}
/**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page: the page to add the mapping to
- * @vma: the vm area in which the mapping is added
- * @address: the user virtual address mapped
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page: Page to add to rmap
+ * @vma: VM area to add page to.
+ * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +795,16 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
+ if (PageAnon(page))
+ return;
+
/*
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
*/
- if (!exclusive) {
- if (PageAnon(page))
- return;
+ if (!exclusive)
anon_vma = anon_vma->root;
- } else {
- /*
- * In this case, swapped-out-but-not-discarded swap-cache
- * is remapped. So, no need to update page->mapping here.
- * We convice anon_vma poitned by page->mapping is not obsolete
- * because vma->anon_vma is necessary to be a family of it.
- */
- if (PageAnon(page))
- return;
- }
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -942,7 +935,7 @@ void page_remove_rmap(struct page *page)
* containing the swap entry, but page not yet written to swap.
*/
if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
- page_clear_dirty(page);
+ page_clear_dirty(page, 1);
set_page_dirty(page);
}
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..47fdeeb9d636 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
inc_nlink(inode);
- atomic_inc(&inode->i_count); /* New dentry reference */
+ ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
d_instantiate(dentry, inode);
out:
@@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
if (*len < 3)
return 255;
- if (hlist_unhashed(&inode->i_hash)) {
+ if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
* so as we hash inodes here rather than at creation
* time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
*/
static DEFINE_SPINLOCK(lock);
spin_lock(&lock);
- if (hlist_unhashed(&inode->i_hash))
+ if (inode_unhashed(inode))
__insert_inode_hash(inode,
inode->i_ino + inode->i_generation);
spin_unlock(&lock);
@@ -2537,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
};
-static int shmem_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static struct dentry *shmem_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
{
- return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
+ return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
static struct file_system_type tmpfs_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .get_sb = shmem_get_sb,
+ .mount = shmem_mount,
.kill_sb = kill_litter_super,
};
@@ -2642,7 +2643,7 @@ out:
static struct file_system_type tmpfs_fs_type = {
.name = "tmpfs",
- .get_sb = ramfs_get_sb,
+ .mount = ramfs_mount,
.kill_sb = kill_litter_super,
};
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..b1e40dafbab3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
struct array_cache *from, unsigned int max)
{
/* Figure out how many entries to transfer */
- int nr = min(min(from->avail, max), to->limit - to->avail);
+ int nr = min3(from->avail, max, to->limit - to->avail);
if (!nr)
return 0;
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
pagevec_free(&pages_to_free);
}
+EXPORT_SYMBOL(release_pages);
/*
* The pages which we're about to release may be in the deferred lru-addition
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9fc7bac7db0c..67ddaaf98c74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
+#include <linux/poll.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
+/* Activity counter to indicate that a swapon or swapoff has occurred */
+static atomic_t proc_poll_event = ATOMIC_INIT(0);
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
filp_close(swap_file, NULL);
err = 0;
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
out_dput:
filp_close(victim, NULL);
@@ -1688,6 +1695,25 @@ out:
}
#ifdef CONFIG_PROC_FS
+struct proc_swaps {
+ struct seq_file seq;
+ int event;
+};
+
+static unsigned swaps_poll(struct file *file, poll_table *wait)
+{
+ struct proc_swaps *s = file->private_data;
+
+ poll_wait(file, &proc_poll_wait, wait);
+
+ if (s->event != atomic_read(&proc_poll_event)) {
+ s->event = atomic_read(&proc_poll_event);
+ return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
+ }
+
+ return POLLIN | POLLRDNORM;
+}
+
/* iterator */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
@@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = {
static int swaps_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &swaps_op);
+ struct proc_swaps *s;
+ int ret;
+
+ s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ file->private_data = s;
+
+ ret = seq_open(file, &swaps_op);
+ if (ret) {
+ kfree(s);
+ return ret;
+ }
+
+ s->seq.private = s;
+ s->event = atomic_read(&proc_poll_event);
+ return ret;
}
static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
+ .poll = swaps_poll,
};
static int __init procswaps_init(void)
@@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
swap_info[prev]->next = type;
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
+ atomic_inc(&proc_poll_event);
+ wake_up_interruptible(&proc_poll_wait);
+
error = 0;
goto out;
bad_swap:
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9f909622a25e..a3d66b3dc5cb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va)
struct rb_node *tmp;
while (*p) {
- struct vmap_area *tmp;
+ struct vmap_area *tmp_va;
parent = *p;
- tmp = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp->va_end)
+ tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+ if (va->va_start < tmp_va->va_end)
p = &(*p)->rb_left;
- else if (va->va_end > tmp->va_start)
+ else if (va->va_end > tmp_va->va_start)
p = &(*p)->rb_right;
else
BUG();
@@ -1596,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
+static inline void *__vmalloc_node_flags(unsigned long size,
+ int node, gfp_t flags)
+{
+ return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
+ node, __builtin_return_address(0));
+}
+
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -1607,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc);
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
- -1, __builtin_return_address(0));
+ return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ */
+void *vzalloc(unsigned long size)
+{
+ return __vmalloc_node_flags(size, -1,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
* @size: allocation size
*
@@ -1653,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node)
}
EXPORT_SYMBOL(vmalloc_node);
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size: allocation size
+ * @node: numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc_node() instead.
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+ return __vmalloc_node_flags(size, node,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_node);
+
#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif
@@ -2350,6 +2392,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
+ __acquires(&vmlist_lock)
{
loff_t n = *pos;
struct vm_struct *v;
@@ -2376,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
+ __releases(&vmlist_lock)
{
read_unlock(&vmlist_lock);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94c9464f262..d31d7ce52c0e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,12 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
+enum lumpy_mode {
+ LUMPY_MODE_NONE,
+ LUMPY_MODE_ASYNC,
+ LUMPY_MODE_SYNC,
+};
+
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
@@ -82,7 +88,7 @@ struct scan_control {
* Intend to reclaim enough continuous memory rather than reclaim
* enough amount of memory. i.e, mode for high order allocation.
*/
- bool lumpy_reclaim_mode;
+ enum lumpy_mode lumpy_reclaim_mode;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
+static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+ bool sync)
+{
+ enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+
+ /*
+ * Some reclaim have alredy been failed. No worth to try synchronous
+ * lumpy reclaim.
+ */
+ if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+ return;
+
+ /*
+ * If we need a large contiguous chunk of memory, or have
+ * trouble getting a small set of contiguous pages, we
+ * will reclaim both active and inactive pages.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ sc->lumpy_reclaim_mode = mode;
+ else if (sc->order && priority < DEF_PRIORITY - 2)
+ sc->lumpy_reclaim_mode = mode;
+ else
+ sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+}
+
+static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+{
+ sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi)
+static int may_write_to_queue(struct backing_dev_info *bdi,
+ struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
@@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
return 1;
if (bdi == current->backing_dev_info)
return 1;
+
+ /* lumpy reclaim for hugepage often need a lot of write */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+ return 1;
return 0;
}
@@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
-/* Request for sync pageout. */
-enum pageout_io {
- PAGEOUT_IO_ASYNC,
- PAGEOUT_IO_SYNC,
-};
-
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -330,7 +365,7 @@ typedef enum {
* Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping,
- enum pageout_io sync_writeback)
+ struct scan_control *sc)
{
/*
* If the page is dirty, only perform writeback if that write
@@ -366,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info))
+ if (!may_write_to_queue(mapping->backing_dev_info, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -376,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
.nr_to_write = SWAP_CLUSTER_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
- .nonblocking = 1,
.for_reclaim = 1,
};
@@ -394,7 +428,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* direct reclaiming a large contiguous area and the
* first attempt to free a range of pages fails.
*/
- if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+ if (PageWriteback(page) &&
+ sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
@@ -402,7 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page,
- trace_reclaim_flags(page, sync_writeback));
+ trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -580,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->lumpy_reclaim_mode)
+ if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
return PAGEREF_RECLAIM;
/*
@@ -616,7 +651,7 @@ static enum page_references page_check_references(struct page *page,
}
/* Reclaim if clean, defer dirty pages to writeback */
- if (referenced_page)
+ if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
@@ -644,12 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct scan_control *sc,
- enum pageout_io sync_writeback)
+ struct zone *zone,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
int pgactivate = 0;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
cond_resched();
@@ -669,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep;
VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(page_zone(page) != zone);
sc->nr_scanned++;
@@ -694,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+ if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+ may_enter_fs)
wait_on_page_writeback(page);
- else
- goto keep_locked;
+ else {
+ unlock_page(page);
+ goto keep_lumpy;
+ }
}
references = page_check_references(page, sc);
@@ -743,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
if (PageDirty(page)) {
+ nr_dirty++;
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -751,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Page is dirty, try to write it out here */
- switch (pageout(page, mapping, sync_writeback)) {
+ switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
+ nr_congested++;
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page))
+ goto keep_lumpy;
+ if (PageDirty(page))
goto keep;
+
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -841,6 +888,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
+ disable_lumpy_reclaim_mode(sc);
continue;
activate_locked:
@@ -853,10 +901,21 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
+ disable_lumpy_reclaim_mode(sc);
+keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
}
+ /*
+ * Tag a zone as congested if all the dirty pages encountered were
+ * backed by a congested BDI. In this case, reclaimers should just
+ * back off and wait for congestion to clear because further reclaim
+ * will encounter the same problem
+ */
+ if (nr_dirty == nr_congested && nr_dirty != 0)
+ zone_set_flag(zone, ZONE_CONGESTED);
+
free_page_list(&free_pages);
list_splice(&ret_pages, page_list);
@@ -1006,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
/* Check that we have not crossed a zone boundary. */
if (unlikely(page_zone_id(cursor_page) != zone_id))
- continue;
+ break;
/*
* If we don't have enough swap space, reclaiming of
@@ -1014,8 +1073,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* pointless.
*/
if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
- !PageSwapCache(cursor_page))
- continue;
+ !PageSwapCache(cursor_page))
+ break;
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
@@ -1026,11 +1085,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
nr_lumpy_dirty++;
scan++;
} else {
- if (mode == ISOLATE_BOTH &&
- page_count(cursor_page))
- nr_lumpy_failed++;
+ /* the page is freed already. */
+ if (!page_count(cursor_page))
+ continue;
+ break;
}
}
+
+ /* If we break out of the loop above, lumpy reclaim failed */
+ if (pfn < end_pfn)
+ nr_lumpy_failed++;
}
*scanned = scan;
@@ -1253,7 +1317,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
return false;
/* Only stall on lumpy reclaim */
- if (!sc->lumpy_reclaim_mode)
+ if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
return false;
/* If we have relaimed everything on the isolated list, no stall */
@@ -1286,7 +1350,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- unsigned long nr_active;
unsigned long nr_anon;
unsigned long nr_file;
@@ -1298,15 +1361,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return SWAP_CLUSTER_MAX;
}
-
+ set_lumpy_reclaim_mode(priority, sc, false);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
+ sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
+ ISOLATE_INACTIVE : ISOLATE_BOTH,
zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
@@ -1318,8 +1381,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
} else {
nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
+ sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
+ ISOLATE_INACTIVE : ISOLATE_BOTH,
zone, sc->mem_cgroup,
0, file);
/*
@@ -1337,20 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc);
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
-
- /*
- * The attempt at page out may have made some
- * of the pages active, mark them inactive again.
- */
- nr_active = clear_active_flags(&page_list, NULL);
- count_vm_events(PGDEACTIVATE, nr_active);
-
- nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+ set_lumpy_reclaim_mode(priority, sc, true);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
local_irq_disable();
@@ -1359,6 +1414,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+
+ trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
+ zone_idx(zone),
+ nr_scanned, nr_reclaimed,
+ priority,
+ trace_shrink_flags(file, sc->lumpy_reclaim_mode));
return nr_reclaimed;
}
@@ -1506,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
}
+#ifdef CONFIG_SWAP
static int inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
@@ -1531,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
{
int low;
+ /*
+ * If we don't have swap space, anonymous page deactivation
+ * is pointless.
+ */
+ if (!total_swap_pages)
+ return 0;
+
if (scanning_global_lru(sc))
low = inactive_anon_is_low_global(zone);
else
low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
return low;
}
+#else
+static inline int inactive_anon_is_low(struct zone *zone,
+ struct scan_control *sc)
+{
+ return 0;
+}
+#endif
static int inactive_file_is_low_global(struct zone *zone)
{
@@ -1721,21 +1797,6 @@ out:
}
}
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
-{
- /*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
- */
- if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->lumpy_reclaim_mode = 1;
- else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->lumpy_reclaim_mode = 1;
- else
- sc->lumpy_reclaim_mode = 0;
-}
-
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
@@ -1750,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
get_scan_count(zone, sc, nr, priority);
- set_lumpy_reclaim_mode(priority, sc);
-
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -1782,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
+ if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
throttle_vm_writeout(sc->gfp_mask);
@@ -1937,21 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
/* Take a nap, wait for some writeback to complete */
if (!sc->hibernation_mode && sc->nr_scanned &&
- priority < DEF_PRIORITY - 2)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ priority < DEF_PRIORITY - 2) {
+ struct zone *preferred_zone;
+
+ first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+ NULL, &preferred_zone);
+ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+ }
}
out:
- /*
- * Now that we've scanned all the zones at this priority level, note
- * that level within the zone so that the next thread which performs
- * scanning of this zone will immediately start out at this priority
- * level. This affects only the decision whether or not to bring
- * mapped pages onto the inactive list.
- */
- if (priority < 0)
- priority = 0;
-
delayacct_freepages_end();
put_mems_allowed();
@@ -2247,6 +2301,15 @@ loop_again:
if (!zone_watermark_ok(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
+ } else {
+ /*
+ * If a zone reaches its high watermark,
+ * consider it to be no longer congested. It's
+ * possible there are dirty pages backed by
+ * congested BDIs but as pressure is relieved,
+ * spectulatively avoid congestion waits
+ */
+ zone_clear_flag(zone, ZONE_CONGESTED);
}
}
@@ -2987,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
return 0;
}
+#ifdef CONFIG_NUMA
/*
* per node 'scan_unevictable_pages' attribute. On demand re-scan of
* a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
{
sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
}
-
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..42eac4d33216 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
+#include <linux/writeback.h>
+#include <linux/compaction.h>
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -394,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
#endif
#ifdef CONFIG_COMPACTION
+
struct contig_page_info {
unsigned long free_pages;
unsigned long free_blocks_total;
@@ -745,6 +748,11 @@ static const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"nr_shmem",
+ "nr_dirtied",
+ "nr_written",
+ "nr_dirty_threshold",
+ "nr_dirty_background_threshold",
+
#ifdef CONFIG_NUMA
"numa_hit",
"numa_miss",
@@ -904,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
.release = seq_release,
};
+enum writeback_stat_item {
+ NR_DIRTY_THRESHOLD,
+ NR_DIRTY_BG_THRESHOLD,
+ NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
static void *vmstat_start(struct seq_file *m, loff_t *pos)
{
unsigned long *v;
-#ifdef CONFIG_VM_EVENT_COUNTERS
- unsigned long *e;
-#endif
- int i;
+ int i, stat_items_size;
if (*pos >= ARRAY_SIZE(vmstat_text))
return NULL;
+ stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+ NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
#ifdef CONFIG_VM_EVENT_COUNTERS
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
- + sizeof(struct vm_event_state), GFP_KERNEL);
-#else
- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
- GFP_KERNEL);
+ stat_items_size += sizeof(struct vm_event_state);
#endif
+
+ v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
v[i] = global_page_state(i);
+ v += NR_VM_ZONE_STAT_ITEMS;
+
+ global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+ v + NR_DIRTY_THRESHOLD);
+ v += NR_VM_WRITEBACK_STAT_ITEMS;
+
#ifdef CONFIG_VM_EVENT_COUNTERS
- e = v + NR_VM_ZONE_STAT_ITEMS;
- all_vm_events(e);
- e[PGPGIN] /= 2; /* sectors -> kbytes */
- e[PGPGOUT] /= 2;
+ all_vm_events(v);
+ v[PGPGIN] /= 2; /* sectors -> kbytes */
+ v[PGPGOUT] /= 2;
#endif
- return v + *pos;
+ return (unsigned long *)m->private + *pos;
}
static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)