Root/mm/memory-failure.c

1/*
2 * Copyright (C) 2008, 2009 Intel Corporation
3 * Authors: Andi Kleen, Fengguang Wu
4 *
5 * This software may be redistributed and/or modified under the terms of
6 * the GNU General Public License ("GPL") version 2 only as published by the
7 * Free Software Foundation.
8 *
9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows to stop using
14 * not-yet-corrupted-but-suspicious pages without killing anything.
15 *
16 * Handles page cache pages in various states. The tricky part
17 * here is that we can access any page asynchronously in respect to
18 * other VM users, because memory failures could happen anytime and
19 * anywhere. This could violate some of their assumptions. This is why
20 * this code has to be extremely careful. Generally it tries to use
21 * normal locking rules, as in get the standard locks, even if that means
22 * the error handling takes potentially a long time.
23 *
24 * There are several operations here with exponential complexity because
25 * of unsuitable VM data structures. For example the operation to map back
26 * from RMAP chains to processes has to walk the complete process list and
27 * has non-linear complexity with the number of processes. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
30 */
31
32/*
33 * Notebook:
34 * - hugetlb needs more code
35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
36 * - pass bad pages to kdump next kernel
37 */
38#include <linux/kernel.h>
39#include <linux/mm.h>
40#include <linux/page-flags.h>
41#include <linux/kernel-page-flags.h>
42#include <linux/sched.h>
43#include <linux/ksm.h>
44#include <linux/rmap.h>
45#include <linux/export.h>
46#include <linux/pagemap.h>
47#include <linux/swap.h>
48#include <linux/backing-dev.h>
49#include <linux/migrate.h>
50#include <linux/page-isolation.h>
51#include <linux/suspend.h>
52#include <linux/slab.h>
53#include <linux/swapops.h>
54#include <linux/hugetlb.h>
55#include <linux/memory_hotplug.h>
56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
58#include "internal.h"
59
60int sysctl_memory_failure_early_kill __read_mostly = 0;
61
62int sysctl_memory_failure_recovery __read_mostly = 1;
63
64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67
68u32 hwpoison_filter_enable = 0;
69u32 hwpoison_filter_dev_major = ~0U;
70u32 hwpoison_filter_dev_minor = ~0U;
71u64 hwpoison_filter_flags_mask;
72u64 hwpoison_filter_flags_value;
73EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
74EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
75EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
76EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
77EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
78
79static int hwpoison_filter_dev(struct page *p)
80{
81    struct address_space *mapping;
82    dev_t dev;
83
84    if (hwpoison_filter_dev_major == ~0U &&
85        hwpoison_filter_dev_minor == ~0U)
86        return 0;
87
88    /*
89     * page_mapping() does not accept slab pages.
90     */
91    if (PageSlab(p))
92        return -EINVAL;
93
94    mapping = page_mapping(p);
95    if (mapping == NULL || mapping->host == NULL)
96        return -EINVAL;
97
98    dev = mapping->host->i_sb->s_dev;
99    if (hwpoison_filter_dev_major != ~0U &&
100        hwpoison_filter_dev_major != MAJOR(dev))
101        return -EINVAL;
102    if (hwpoison_filter_dev_minor != ~0U &&
103        hwpoison_filter_dev_minor != MINOR(dev))
104        return -EINVAL;
105
106    return 0;
107}
108
109static int hwpoison_filter_flags(struct page *p)
110{
111    if (!hwpoison_filter_flags_mask)
112        return 0;
113
114    if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
115                    hwpoison_filter_flags_value)
116        return 0;
117    else
118        return -EINVAL;
119}
120
121/*
122 * This allows stress tests to limit test scope to a collection of tasks
123 * by putting them under some memcg. This prevents killing unrelated/important
124 * processes such as /sbin/init. Note that the target task may share clean
125 * pages with init (eg. libc text), which is harmless. If the target task
126 * shares _dirty_ pages with another task B, the test scheme must make sure B
127 * is also included in the memcg. Finally, due to race conditions this filter
128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page.
130 */
131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p)
135{
136    struct mem_cgroup *mem;
137    struct cgroup_subsys_state *css;
138    unsigned long ino;
139
140    if (!hwpoison_filter_memcg)
141        return 0;
142
143    mem = try_get_mem_cgroup_from_page(p);
144    if (!mem)
145        return -EINVAL;
146
147    css = mem_cgroup_css(mem);
148    /* root_mem_cgroup has NULL dentries */
149    if (!css->cgroup->dentry)
150        return -EINVAL;
151
152    ino = css->cgroup->dentry->d_inode->i_ino;
153    css_put(css);
154
155    if (ino != hwpoison_filter_memcg)
156        return -EINVAL;
157
158    return 0;
159}
160#else
/* Without CONFIG_MEMCG_SWAP there is no memcg filter: every page passes. */
static int hwpoison_filter_task(struct page *p)
{
    return 0;
}
162#endif
163
164int hwpoison_filter(struct page *p)
165{
166    if (!hwpoison_filter_enable)
167        return 0;
168
169    if (hwpoison_filter_dev(p))
170        return -EINVAL;
171
172    if (hwpoison_filter_flags(p))
173        return -EINVAL;
174
175    if (hwpoison_filter_task(p))
176        return -EINVAL;
177
178    return 0;
179}
180#else
/*
 * Neither CONFIG_HWPOISON_INJECT nor CONFIG_HWPOISON_INJECT_MODULE is set:
 * there is nothing to filter, so every page passes.
 */
int hwpoison_filter(struct page *p)
{
    return 0;
}
185#endif
186
/* Exported whichever hwpoison_filter() variant was compiled in. */
EXPORT_SYMBOL_GPL(hwpoison_filter);
188
189/*
190 * Send all the processes who have the page mapped a signal.
191 * ``action optional'' if they are not immediately affected by the error
192 * ``action required'' if error happened in current execution context
193 */
194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
195            unsigned long pfn, struct page *page, int flags)
196{
197    struct siginfo si;
198    int ret;
199
200    printk(KERN_ERR
201        "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
202        pfn, t->comm, t->pid);
203    si.si_signo = SIGBUS;
204    si.si_errno = 0;
205    si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO
207    si.si_trapno = trapno;
208#endif
209    si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
210
211    if ((flags & MF_ACTION_REQUIRED) && t == current) {
212        si.si_code = BUS_MCEERR_AR;
213        ret = force_sig_info(SIGBUS, &si, t);
214    } else {
215        /*
216         * Don't use force here, it's convenient if the signal
217         * can be temporarily blocked.
218         * This could cause a loop when the user sets SIGBUS
219         * to SIG_IGN, but hopefully no one will do that?
220         */
221        si.si_code = BUS_MCEERR_AO;
222        ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
223    }
224    if (ret < 0)
225        printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
226               t->comm, t->pid, ret);
227    return ret;
228}
229
230/*
231 * When a unknown page type is encountered drain as many buffers as possible
232 * in the hope to turn the page into a LRU or free page, which we can handle.
233 */
234void shake_page(struct page *p, int access)
235{
236    if (!PageSlab(p)) {
237        lru_add_drain_all();
238        if (PageLRU(p))
239            return;
240        drain_all_pages();
241        if (PageLRU(p) || is_free_buddy_page(p))
242            return;
243    }
244
245    /*
246     * Only call shrink_slab here (which would also shrink other caches) if
247     * access is not potentially fatal.
248     */
249    if (access) {
250        int nr;
251        int nid = page_to_nid(p);
252        do {
253            struct shrink_control shrink = {
254                .gfp_mask = GFP_KERNEL,
255            };
256            node_set(nid, shrink.nodes_to_scan);
257
258            nr = shrink_slab(&shrink, 1000, 1000);
259            if (page_count(p) == 1)
260                break;
261        } while (nr > 10);
262    }
263}
264EXPORT_SYMBOL_GPL(shake_page);
265
266/*
267 * Kill all processes that have a poisoned page mapped and then isolate
268 * the page.
269 *
270 * General strategy:
271 * Find all processes having the page mapped and kill them.
272 * But we keep a page reference around so that the page is not
273 * actually freed yet.
274 * Then stash the page away
275 *
276 * There's no convenient way to get back to mapped processes
277 * from the VMAs. So do a brute-force search over all
278 * running processes.
279 *
280 * Remember that machine checks are not common (or rather
281 * if they are common you have other problems), so this shouldn't
282 * be a performance issue.
283 *
284 * Also there are some races possible while we get from the
285 * error detection to actually handle it.
286 */
287
288struct to_kill {
289    struct list_head nd;
290    struct task_struct *tsk;
291    unsigned long addr;
292    char addr_valid;
293};
294
295/*
296 * Failure handling: if we can't find or can't kill a process there's
297 * not much we can do. We just print a message and ignore otherwise.
298 */
299
300/*
301 * Schedule a process for later kill.
302 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
303 * TBD would GFP_NOIO be enough?
304 */
305static void add_to_kill(struct task_struct *tsk, struct page *p,
306               struct vm_area_struct *vma,
307               struct list_head *to_kill,
308               struct to_kill **tkc)
309{
310    struct to_kill *tk;
311
312    if (*tkc) {
313        tk = *tkc;
314        *tkc = NULL;
315    } else {
316        tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
317        if (!tk) {
318            printk(KERN_ERR
319        "MCE: Out of memory while machine check handling\n");
320            return;
321        }
322    }
323    tk->addr = page_address_in_vma(p, vma);
324    tk->addr_valid = 1;
325
326    /*
327     * In theory we don't have to kill when the page was
328     * munmaped. But it could be also a mremap. Since that's
329     * likely very rare kill anyways just out of paranoia, but use
330     * a SIGKILL because the error is not contained anymore.
331     */
332    if (tk->addr == -EFAULT) {
333        pr_info("MCE: Unable to find user space address %lx in %s\n",
334            page_to_pfn(p), tsk->comm);
335        tk->addr_valid = 0;
336    }
337    get_task_struct(tsk);
338    tk->tsk = tsk;
339    list_add_tail(&tk->nd, to_kill);
340}
341
342/*
343 * Kill the processes that have been collected earlier.
344 *
345 * Only do anything when DOIT is set, otherwise just free the list
346 * (this is used for clean pages which do not need killing)
347 * Also when FAIL is set do a force kill because something went
348 * wrong earlier.
349 */
350static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
351              int fail, struct page *page, unsigned long pfn,
352              int flags)
353{
354    struct to_kill *tk, *next;
355
356    list_for_each_entry_safe (tk, next, to_kill, nd) {
357        if (forcekill) {
358            /*
359             * In case something went wrong with munmapping
360             * make sure the process doesn't catch the
361             * signal and then access the memory. Just kill it.
362             */
363            if (fail || tk->addr_valid == 0) {
364                printk(KERN_ERR
365        "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
366                    pfn, tk->tsk->comm, tk->tsk->pid);
367                force_sig(SIGKILL, tk->tsk);
368            }
369
370            /*
371             * In theory the process could have mapped
372             * something else on the address in-between. We could
373             * check for that, but we need to tell the
374             * process anyways.
375             */
376            else if (kill_proc(tk->tsk, tk->addr, trapno,
377                          pfn, page, flags) < 0)
378                printk(KERN_ERR
379        "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
380                    pfn, tk->tsk->comm, tk->tsk->pid);
381        }
382        put_task_struct(tk->tsk);
383        kfree(tk);
384    }
385}
386
387static int task_early_kill(struct task_struct *tsk)
388{
389    if (!tsk->mm)
390        return 0;
391    if (tsk->flags & PF_MCE_PROCESS)
392        return !!(tsk->flags & PF_MCE_EARLY);
393    return sysctl_memory_failure_early_kill;
394}
395
396/*
397 * Collect processes when the error hit an anonymous page.
398 */
399static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400                  struct to_kill **tkc)
401{
402    struct vm_area_struct *vma;
403    struct task_struct *tsk;
404    struct anon_vma *av;
405    pgoff_t pgoff;
406
407    av = page_lock_anon_vma_read(page);
408    if (av == NULL) /* Not actually mapped anymore */
409        return;
410
411    pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
412    read_lock(&tasklist_lock);
413    for_each_process (tsk) {
414        struct anon_vma_chain *vmac;
415
416        if (!task_early_kill(tsk))
417            continue;
418        anon_vma_interval_tree_foreach(vmac, &av->rb_root,
419                           pgoff, pgoff) {
420            vma = vmac->vma;
421            if (!page_mapped_in_vma(page, vma))
422                continue;
423            if (vma->vm_mm == tsk->mm)
424                add_to_kill(tsk, page, vma, to_kill, tkc);
425        }
426    }
427    read_unlock(&tasklist_lock);
428    page_unlock_anon_vma_read(av);
429}
430
431/*
432 * Collect processes when the error hit a file mapped page.
433 */
434static void collect_procs_file(struct page *page, struct list_head *to_kill,
435                  struct to_kill **tkc)
436{
437    struct vm_area_struct *vma;
438    struct task_struct *tsk;
439    struct address_space *mapping = page->mapping;
440
441    mutex_lock(&mapping->i_mmap_mutex);
442    read_lock(&tasklist_lock);
443    for_each_process(tsk) {
444        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
445
446        if (!task_early_kill(tsk))
447            continue;
448
449        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
450                      pgoff) {
451            /*
452             * Send early kill signal to tasks where a vma covers
453             * the page but the corrupted page is not necessarily
454             * mapped it in its pte.
455             * Assume applications who requested early kill want
456             * to be informed of all such data corruptions.
457             */
458            if (vma->vm_mm == tsk->mm)
459                add_to_kill(tsk, page, vma, to_kill, tkc);
460        }
461    }
462    read_unlock(&tasklist_lock);
463    mutex_unlock(&mapping->i_mmap_mutex);
464}
465
466/*
467 * Collect the processes who have the corrupted page mapped to kill.
468 * This is done in two steps for locking reasons.
469 * First preallocate one tokill structure outside the spin locks,
470 * so that we can kill at least one process reasonably reliable.
471 */
472static void collect_procs(struct page *page, struct list_head *tokill)
473{
474    struct to_kill *tk;
475
476    if (!page->mapping)
477        return;
478
479    tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
480    if (!tk)
481        return;
482    if (PageAnon(page))
483        collect_procs_anon(page, tokill, &tk);
484    else
485        collect_procs_file(page, tokill, &tk);
486    kfree(tk);
487}
488
489/*
490 * Error handlers for various types of pages.
491 */
492
/* Result of a page-state handler; also the index into action_name[]. */
enum outcome {
    IGNORED = 0,    /* Error: cannot be handled */
    FAILED,         /* Error: handling failed */
    DELAYED,        /* Will be handled later */
    RECOVERED,      /* Successfully recovered */
};
499
500static const char *action_name[] = {
501    [IGNORED] = "Ignored",
502    [FAILED] = "Failed",
503    [DELAYED] = "Delayed",
504    [RECOVERED] = "Recovered",
505};
506
507/*
508 * XXX: It is possible that a page is isolated from LRU cache,
509 * and then kept in swap cache or failed to remove from page cache.
510 * The page count will stop it from being freed by unpoison.
511 * Stress tests should be aware of this memory leak problem.
512 */
513static int delete_from_lru_cache(struct page *p)
514{
515    if (!isolate_lru_page(p)) {
516        /*
517         * Clear sensible page flags, so that the buddy system won't
518         * complain when the page is unpoison-and-freed.
519         */
520        ClearPageActive(p);
521        ClearPageUnevictable(p);
522        /*
523         * drop the page count elevated by isolate_lru_page()
524         */
525        page_cache_release(p);
526        return 0;
527    }
528    return -EIO;
529}
530
531/*
532 * Error hit kernel page.
533 * Do nothing, try to be lucky and not touch this instead. For a few cases we
534 * could be more sophisticated.
535 */
536static int me_kernel(struct page *p, unsigned long pfn)
537{
538    return IGNORED;
539}
540
541/*
542 * Page in unknown state. Do nothing.
543 */
544static int me_unknown(struct page *p, unsigned long pfn)
545{
546    printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
547    return FAILED;
548}
549
550/*
551 * Clean (or cleaned) page cache page.
552 */
553static int me_pagecache_clean(struct page *p, unsigned long pfn)
554{
555    int err;
556    int ret = FAILED;
557    struct address_space *mapping;
558
559    delete_from_lru_cache(p);
560
561    /*
562     * For anonymous pages we're done the only reference left
563     * should be the one m_f() holds.
564     */
565    if (PageAnon(p))
566        return RECOVERED;
567
568    /*
569     * Now truncate the page in the page cache. This is really
570     * more like a "temporary hole punch"
571     * Don't do this for block devices when someone else
572     * has a reference, because it could be file system metadata
573     * and that's not safe to truncate.
574     */
575    mapping = page_mapping(p);
576    if (!mapping) {
577        /*
578         * Page has been teared down in the meanwhile
579         */
580        return FAILED;
581    }
582
583    /*
584     * Truncation is a bit tricky. Enable it per file system for now.
585     *
586     * Open: to take i_mutex or not for this? Right now we don't.
587     */
588    if (mapping->a_ops->error_remove_page) {
589        err = mapping->a_ops->error_remove_page(mapping, p);
590        if (err != 0) {
591            printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
592                    pfn, err);
593        } else if (page_has_private(p) &&
594                !try_to_release_page(p, GFP_NOIO)) {
595            pr_info("MCE %#lx: failed to release buffers\n", pfn);
596        } else {
597            ret = RECOVERED;
598        }
599    } else {
600        /*
601         * If the file system doesn't support it just invalidate
602         * This fails on dirty or anything with private pages
603         */
604        if (invalidate_inode_page(p))
605            ret = RECOVERED;
606        else
607            printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
608                pfn);
609    }
610    return ret;
611}
612
613/*
614 * Dirty cache page page
615 * Issues: when the error hit a hole page the error is not properly
616 * propagated.
617 */
618static int me_pagecache_dirty(struct page *p, unsigned long pfn)
619{
620    struct address_space *mapping = page_mapping(p);
621
622    SetPageError(p);
623    /* TBD: print more information about the file. */
624    if (mapping) {
625        /*
626         * IO error will be reported by write(), fsync(), etc.
627         * who check the mapping.
628         * This way the application knows that something went
629         * wrong with its dirty file data.
630         *
631         * There's one open issue:
632         *
633         * The EIO will be only reported on the next IO
634         * operation and then cleared through the IO map.
635         * Normally Linux has two mechanisms to pass IO error
636         * first through the AS_EIO flag in the address space
637         * and then through the PageError flag in the page.
638         * Since we drop pages on memory failure handling the
639         * only mechanism open to use is through AS_AIO.
640         *
641         * This has the disadvantage that it gets cleared on
642         * the first operation that returns an error, while
643         * the PageError bit is more sticky and only cleared
644         * when the page is reread or dropped. If an
645         * application assumes it will always get error on
646         * fsync, but does other operations on the fd before
647         * and the page is dropped between then the error
648         * will not be properly reported.
649         *
650         * This can already happen even without hwpoisoned
651         * pages: first on metadata IO errors (which only
652         * report through AS_EIO) or when the page is dropped
653         * at the wrong time.
654         *
655         * So right now we assume that the application DTRT on
656         * the first EIO, but we're not worse than other parts
657         * of the kernel.
658         */
659        mapping_set_error(mapping, EIO);
660    }
661
662    return me_pagecache_clean(p, pfn);
663}
664
665/*
666 * Clean and dirty swap cache.
667 *
668 * Dirty swap cache page is tricky to handle. The page could live both in page
669 * cache and swap cache(ie. page is freshly swapped in). So it could be
670 * referenced concurrently by 2 types of PTEs:
671 * normal PTEs and swap PTEs. We try to handle them consistently by calling
672 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
673 * and then
674 * - clear dirty bit to prevent IO
675 * - remove from LRU
676 * - but keep in the swap cache, so that when we return to it on
677 * a later page fault, we know the application is accessing
678 * corrupted data and shall be killed (we installed simple
679 * interception code in do_swap_page to catch it).
680 *
681 * Clean swap cache pages can be directly isolated. A later page fault will
682 * bring in the known good data from disk.
683 */
684static int me_swapcache_dirty(struct page *p, unsigned long pfn)
685{
686    ClearPageDirty(p);
687    /* Trigger EIO in shmem: */
688    ClearPageUptodate(p);
689
690    if (!delete_from_lru_cache(p))
691        return DELAYED;
692    else
693        return FAILED;
694}
695
696static int me_swapcache_clean(struct page *p, unsigned long pfn)
697{
698    delete_from_swap_cache(p);
699
700    if (!delete_from_lru_cache(p))
701        return RECOVERED;
702    else
703        return FAILED;
704}
705
706/*
707 * Huge pages. Needs work.
708 * Issues:
709 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
710 * To narrow down kill region to one page, we need to break up pmd.
711 */
712static int me_huge_page(struct page *p, unsigned long pfn)
713{
714    int res = 0;
715    struct page *hpage = compound_head(p);
716    /*
717     * We can safely recover from error on free or reserved (i.e.
718     * not in-use) hugepage by dequeuing it from freelist.
719     * To check whether a hugepage is in-use or not, we can't use
720     * page->lru because it can be used in other hugepage operations,
721     * such as __unmap_hugepage_range() and gather_surplus_pages().
722     * So instead we use page_mapping() and PageAnon().
723     * We assume that this function is called with page lock held,
724     * so there is no race between isolation and mapping/unmapping.
725     */
726    if (!(page_mapping(hpage) || PageAnon(hpage))) {
727        res = dequeue_hwpoisoned_huge_page(hpage);
728        if (!res)
729            return RECOVERED;
730    }
731    return DELAYED;
732}
733
734/*
735 * Various page states we can handle.
736 *
737 * A page state is defined by its current page->flags bits.
738 * The table matches them in order and calls the right handler.
739 *
740 * This is quite tricky because we can access a page at any time
741 * in its life cycle, so all accesses have to be extremely careful.
742 *
743 * This is not complete. More states could be added.
744 * For any missing state don't attempt recovery.
745 */
746
747#define dirty (1UL << PG_dirty)
748#define sc (1UL << PG_swapcache)
749#define unevict (1UL << PG_unevictable)
750#define mlock (1UL << PG_mlocked)
751#define writeback (1UL << PG_writeback)
752#define lru (1UL << PG_lru)
753#define swapbacked (1UL << PG_swapbacked)
754#define head (1UL << PG_head)
755#define tail (1UL << PG_tail)
756#define compound (1UL << PG_compound)
757#define slab (1UL << PG_slab)
758#define reserved (1UL << PG_reserved)
759
760static struct page_state {
761    unsigned long mask;
762    unsigned long res;
763    char *msg;
764    int (*action)(struct page *p, unsigned long pfn);
765} error_states[] = {
766    { reserved, reserved, "reserved kernel", me_kernel },
767    /*
768     * free pages are specially detected outside this table:
769     * PG_buddy pages only make a small fraction of all free pages.
770     */
771
772    /*
773     * Could in theory check if slab page is free or if we can drop
774     * currently unused objects without touching them. But just
775     * treat it as standard kernel for now.
776     */
777    { slab, slab, "kernel slab", me_kernel },
778
779#ifdef CONFIG_PAGEFLAGS_EXTENDED
780    { head, head, "huge", me_huge_page },
781    { tail, tail, "huge", me_huge_page },
782#else
783    { compound, compound, "huge", me_huge_page },
784#endif
785
786    { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
787    { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
788
789    { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
790    { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
791
792    { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
793    { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
794
795    { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
796    { lru|dirty, lru, "clean LRU", me_pagecache_clean },
797
798    /*
799     * Catchall entry: must be at end.
800     */
801    { 0, 0, "unknown page state", me_unknown },
802};
803
804#undef dirty
805#undef sc
806#undef unevict
807#undef mlock
808#undef writeback
809#undef lru
810#undef swapbacked
811#undef head
812#undef tail
813#undef compound
814#undef slab
815#undef reserved
816
817/*
818 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
819 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
820 */
821static void action_result(unsigned long pfn, char *msg, int result)
822{
823    pr_err("MCE %#lx: %s page recovery: %s\n",
824        pfn, msg, action_name[result]);
825}
826
827static int page_action(struct page_state *ps, struct page *p,
828            unsigned long pfn)
829{
830    int result;
831    int count;
832
833    result = ps->action(p, pfn);
834    action_result(pfn, ps->msg, result);
835
836    count = page_count(p) - 1;
837    if (ps->action == me_swapcache_dirty && result == DELAYED)
838        count--;
839    if (count != 0) {
840        printk(KERN_ERR
841               "MCE %#lx: %s page still referenced by %d users\n",
842               pfn, ps->msg, count);
843        result = FAILED;
844    }
845
846    /* Could do more checks here if page looks ok */
847    /*
848     * Could adjust zone counters here to correct for the missing page.
849     */
850
851    return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
852}
853
854/*
855 * Do all that is necessary to remove user space mappings. Unmap
856 * the pages and send SIGBUS to the processes if the data was dirty.
 *
 * @p:      the poisoned page (its compound head is looked up here)
 * @pfn:    page frame number of @p, used for log messages and signals
 * @trapno: trap number handed on to kill_procs()
 * @flags:  MF_* flags; MF_MUST_KILL forces signalling even for clean pages
 *
 * Returns SWAP_SUCCESS without touching anything for reserved, slab and
 * unmapped pages, SWAP_FAIL for KSM pages and when a THP cannot be split,
 * and otherwise the result of try_to_unmap().
857 */
858static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
859                  int trapno, int flags)
860{
861    enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
862    struct address_space *mapping;
863    LIST_HEAD(tokill);
864    int ret;
865    int kill = 1, forcekill;
866    struct page *hpage = compound_head(p);
867    struct page *ppage;
868
869    if (PageReserved(p) || PageSlab(p))
870        return SWAP_SUCCESS;
871
872    /*
873     * This check implies we don't kill processes if their pages
874     * are in the swap cache early. Those are always late kills.
875     */
876    if (!page_mapped(hpage))
877        return SWAP_SUCCESS;
878
879    if (PageKsm(p))
880        return SWAP_FAIL;
881
882    if (PageSwapCache(p)) {
883        printk(KERN_ERR
884               "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
885        ttu |= TTU_IGNORE_HWPOISON;
886    }
887
888    /*
889     * Propagate the dirty bit from PTEs to struct page first, because we
890     * need this to decide if we should kill or just drop the page.
891     * XXX: the dirty test could be racy: set_page_dirty() may not always
892     * be called inside page lock (it's recommended but not enforced).
893     */
894    mapping = page_mapping(hpage);
895    if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
896        mapping_cap_writeback_dirty(mapping)) {
897        if (page_mkclean(hpage)) {
898            SetPageDirty(hpage);
899        } else {
            /* Clean page: nobody needs a signal, just drop it. */
900            kill = 0;
901            ttu |= TTU_IGNORE_HWPOISON;
902            printk(KERN_INFO
903    "MCE %#lx: corrupted page was clean: dropped without side effects\n",
904                pfn);
905        }
906    }
907
908    /*
909     * ppage: poisoned page
910     * if p is regular page(4k page)
911     * ppage == real poisoned page;
912     * else p is hugetlb or THP, ppage == head page.
913     */
914    ppage = hpage;
915
916    if (PageTransHuge(hpage)) {
917        /*
918         * Verify that this isn't a hugetlbfs head page, the check for
919         * PageAnon is just to avoid tripping a split_huge_page
920         * internal debug check, as split_huge_page refuses to deal with
921         * anything that isn't an anon page. PageAnon can't go away from
922         * under us because we hold a refcount on the hpage. Without a
923         * refcount on the hpage, split_huge_page can't be safely called
924         * in the first place; having a refcount on the tail isn't
925         * enough to be safe.
926         */
927        if (!PageHuge(hpage) && PageAnon(hpage)) {
928            if (unlikely(split_huge_page(hpage))) {
929                /*
930                 * FIXME: if splitting the THP fails, it is
931                 * better to stop the following operation rather
932                 * than causing panic by unmapping. System might
933                 * survive if the page is freed later.
934                 */
935                printk(KERN_INFO
936                    "MCE %#lx: failed to split THP\n", pfn);
937
938                BUG_ON(!PageHWPoison(p));
939                return SWAP_FAIL;
940            }
941            /* THP is split, so ppage should be the real poisoned page. */
942            ppage = p;
943        }
944    }
945
946    /*
947     * First collect all the processes that have the page
948     * mapped in dirty form. This has to be done before try_to_unmap,
949     * because ttu takes the rmap data structures down.
950     *
951     * Error handling: We ignore errors here because
952     * there's nothing that can be done.
953     */
954    if (kill)
955        collect_procs(ppage, &tokill);
956
    /*
     * ppage differs from hpage only after a successful THP split above;
     * in that case take ppage's own page lock around try_to_unmap().
     */
957    if (hpage != ppage)
958        lock_page(ppage);
959
960    ret = try_to_unmap(ppage, ttu);
961    if (ret != SWAP_SUCCESS)
962        printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
963                pfn, page_mapcount(ppage));
964
965    if (hpage != ppage)
966        unlock_page(ppage);
967
968    /*
969     * Now that the dirty bit has been propagated to the
970     * struct page and all unmaps done we can decide if
971     * killing is needed or not. Only kill when the page
972     * was dirty or the process is not restartable,
973     * otherwise the tokill list is merely
974     * freed. When there was a problem unmapping earlier
975     * use a more force-full uncatchable kill to prevent
976     * any accesses to the poisoned memory.
977     */
978    forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
979    kill_procs(&tokill, forcekill, trapno,
980              ret != SWAP_SUCCESS, p, pfn, flags);
981
982    return ret;
983}
984
985static void set_page_hwpoison_huge_page(struct page *hpage)
986{
987    int i;
988    int nr_pages = 1 << compound_order(hpage);
989    for (i = 0; i < nr_pages; i++)
990        SetPageHWPoison(hpage + i);
991}
992
993static void clear_page_hwpoison_huge_page(struct page *hpage)
994{
995    int i;
996    int nr_pages = 1 << compound_order(hpage);
997    for (i = 0; i < nr_pages; i++)
998        ClearPageHWPoison(hpage + i);
999}
1000
1001/**
1002 * memory_failure - Handle memory failure of a page.
1003 * @pfn: Page Number of the corrupted page
1004 * @trapno: Trap number reported in the signal to user space.
1005 * @flags: fine tune action taken
1006 *
1007 * This function is called by the low level machine check code
1008 * of an architecture when it detects hardware memory corruption
1009 * of a page. It tries its best to recover, which includes
1010 * dropping pages, killing processes etc.
1011 *
1012 * The function is primarily of use for corruptions that
1013 * happen outside the current execution context (e.g. when
1014 * detected by a background scrubber)
1015 *
1016 * Must run in process context (e.g. a work queue) with interrupts
1017 * enabled and no spinlocks hold.
1018 */
1019int memory_failure(unsigned long pfn, int trapno, int flags)
1020{
1021    struct page_state *ps;
1022    struct page *p;
1023    struct page *hpage;
1024    int res;
1025    unsigned int nr_pages;
1026    unsigned long page_flags;
1027
1028    if (!sysctl_memory_failure_recovery)
1029        panic("Memory failure from trap %d on page %lx", trapno, pfn);
1030
1031    if (!pfn_valid(pfn)) {
1032        printk(KERN_ERR
1033               "MCE %#lx: memory outside kernel control\n",
1034               pfn);
1035        return -ENXIO;
1036    }
1037
1038    p = pfn_to_page(pfn);
1039    hpage = compound_head(p);
1040    if (TestSetPageHWPoison(p)) {
1041        printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1042        return 0;
1043    }
1044
1045    /*
1046     * Currently errors on hugetlbfs pages are measured in hugepage units,
1047     * so nr_pages should be 1 << compound_order. OTOH when errors are on
1048     * transparent hugepages, they are supposed to be split and error
1049     * measurement is done in normal page units. So nr_pages should be one
1050     * in this case.
1051     */
1052    if (PageHuge(p))
1053        nr_pages = 1 << compound_order(hpage);
1054    else /* normal page or thp */
1055        nr_pages = 1;
1056    atomic_long_add(nr_pages, &num_poisoned_pages);
1057
1058    /*
1059     * We need/can do nothing about count=0 pages.
1060     * 1) it's a free page, and therefore in safe hand:
1061     * prep_new_page() will be the gate keeper.
1062     * 2) it's a free hugepage, which is also safe:
1063     * an affected hugepage will be dequeued from hugepage freelist,
1064     * so there's no concern about reusing it ever after.
1065     * 3) it's part of a non-compound high order page.
1066     * Implies some kernel user: cannot stop them from
1067     * R/W the page; let's pray that the page has been
1068     * used and will be freed some time later.
1069     * In fact it's dangerous to directly bump up page count from 0,
1070     * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1071     */
1072    if (!(flags & MF_COUNT_INCREASED) &&
1073        !get_page_unless_zero(hpage)) {
1074        if (is_free_buddy_page(p)) {
1075            action_result(pfn, "free buddy", DELAYED);
1076            return 0;
1077        } else if (PageHuge(hpage)) {
1078            /*
1079             * Check "just unpoisoned", "filter hit", and
1080             * "race with other subpage."
1081             */
1082            lock_page(hpage);
1083            if (!PageHWPoison(hpage)
1084                || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1085                || (p != hpage && TestSetPageHWPoison(hpage))) {
1086                atomic_long_sub(nr_pages, &num_poisoned_pages);
1087                return 0;
1088            }
1089            set_page_hwpoison_huge_page(hpage);
1090            res = dequeue_hwpoisoned_huge_page(hpage);
1091            action_result(pfn, "free huge",
1092                      res ? IGNORED : DELAYED);
1093            unlock_page(hpage);
1094            return res;
1095        } else {
1096            action_result(pfn, "high order kernel", IGNORED);
1097            return -EBUSY;
1098        }
1099    }
1100
1101    /*
1102     * We ignore non-LRU pages for good reasons.
1103     * - PG_locked is only well defined for LRU pages and a few others
1104     * - to avoid races with __set_page_locked()
1105     * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1106     * The check (unnecessarily) ignores LRU pages being isolated and
1107     * walked by the page reclaim code, however that's not a big loss.
1108     */
1109    if (!PageHuge(p) && !PageTransTail(p)) {
1110        if (!PageLRU(p))
1111            shake_page(p, 0);
1112        if (!PageLRU(p)) {
1113            /*
1114             * shake_page could have turned it free.
1115             */
1116            if (is_free_buddy_page(p)) {
1117                if (flags & MF_COUNT_INCREASED)
1118                    action_result(pfn, "free buddy", DELAYED);
1119                else
1120                    action_result(pfn, "free buddy, 2nd try", DELAYED);
1121                return 0;
1122            }
1123            action_result(pfn, "non LRU", IGNORED);
1124            put_page(p);
1125            return -EBUSY;
1126        }
1127    }
1128
1129    /*
1130     * Lock the page and wait for writeback to finish.
1131     * It's very difficult to mess with pages currently under IO
1132     * and in many cases impossible, so we just avoid it here.
1133     */
1134    lock_page(hpage);
1135
1136    /*
1137     * We use page flags to determine what action should be taken, but
1138     * the flags can be modified by the error containment action. One
1139     * example is an mlocked page, where PG_mlocked is cleared by
1140     * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1141     * correctly, we save a copy of the page flags at this time.
1142     */
1143    page_flags = p->flags;
1144
1145    /*
1146     * unpoison always clear PG_hwpoison inside page lock
1147     */
1148    if (!PageHWPoison(p)) {
1149        printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1150        res = 0;
1151        goto out;
1152    }
1153    if (hwpoison_filter(p)) {
1154        if (TestClearPageHWPoison(p))
1155            atomic_long_sub(nr_pages, &num_poisoned_pages);
1156        unlock_page(hpage);
1157        put_page(hpage);
1158        return 0;
1159    }
1160
1161    /*
1162     * For error on the tail page, we should set PG_hwpoison
1163     * on the head page to show that the hugepage is hwpoisoned
1164     */
1165    if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1166        action_result(pfn, "hugepage already hardware poisoned",
1167                IGNORED);
1168        unlock_page(hpage);
1169        put_page(hpage);
1170        return 0;
1171    }
1172    /*
1173     * Set PG_hwpoison on all pages in an error hugepage,
1174     * because containment is done in hugepage unit for now.
1175     * Since we have done TestSetPageHWPoison() for the head page with
1176     * page lock held, we can safely set PG_hwpoison bits on tail pages.
1177     */
1178    if (PageHuge(p))
1179        set_page_hwpoison_huge_page(hpage);
1180
1181    wait_on_page_writeback(p);
1182
1183    /*
1184     * Now take care of user space mappings.
1185     * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1186     */
1187    if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1188        printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1189        res = -EBUSY;
1190        goto out;
1191    }
1192
1193    /*
1194     * Torn down by someone else?
1195     */
1196    if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1197        action_result(pfn, "already truncated LRU", IGNORED);
1198        res = -EBUSY;
1199        goto out;
1200    }
1201
1202    res = -EBUSY;
1203    /*
1204     * The first check uses the current page flags which may not have any
1205     * relevant information. The second check with the saved page flagss is
1206     * carried out only if the first check can't determine the page status.
1207     */
1208    for (ps = error_states;; ps++)
1209        if ((p->flags & ps->mask) == ps->res)
1210            break;
1211
1212    page_flags |= (p->flags & (1UL << PG_dirty));
1213
1214    if (!ps->mask)
1215        for (ps = error_states;; ps++)
1216            if ((page_flags & ps->mask) == ps->res)
1217                break;
1218    res = page_action(ps, p, pfn);
1219out:
1220    unlock_page(hpage);
1221    return res;
1222}
1223EXPORT_SYMBOL_GPL(memory_failure);
1224
1225#define MEMORY_FAILURE_FIFO_ORDER 4
1226#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1227
1228struct memory_failure_entry {
1229    unsigned long pfn;
1230    int trapno;
1231    int flags;
1232};
1233
1234struct memory_failure_cpu {
1235    DECLARE_KFIFO(fifo, struct memory_failure_entry,
1236              MEMORY_FAILURE_FIFO_SIZE);
1237    spinlock_t lock;
1238    struct work_struct work;
1239};
1240
1241static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1242
1243/**
1244 * memory_failure_queue - Schedule handling memory failure of a page.
1245 * @pfn: Page Number of the corrupted page
1246 * @trapno: Trap number reported in the signal to user space.
1247 * @flags: Flags for memory failure handling
1248 *
1249 * This function is called by the low level hardware error handler
1250 * when it detects hardware memory corruption of a page. It schedules
1251 * the recovering of error page, including dropping pages, killing
1252 * processes etc.
1253 *
1254 * The function is primarily of use for corruptions that
1255 * happen outside the current execution context (e.g. when
1256 * detected by a background scrubber)
1257 *
1258 * Can run in IRQ context.
1259 */
1260void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1261{
1262    struct memory_failure_cpu *mf_cpu;
1263    unsigned long proc_flags;
1264    struct memory_failure_entry entry = {
1265        .pfn = pfn,
1266        .trapno = trapno,
1267        .flags = flags,
1268    };
1269
1270    mf_cpu = &get_cpu_var(memory_failure_cpu);
1271    spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1272    if (kfifo_put(&mf_cpu->fifo, &entry))
1273        schedule_work_on(smp_processor_id(), &mf_cpu->work);
1274    else
1275        pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1276               pfn);
1277    spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1278    put_cpu_var(memory_failure_cpu);
1279}
1280EXPORT_SYMBOL_GPL(memory_failure_queue);
1281
1282static void memory_failure_work_func(struct work_struct *work)
1283{
1284    struct memory_failure_cpu *mf_cpu;
1285    struct memory_failure_entry entry = { 0, };
1286    unsigned long proc_flags;
1287    int gotten;
1288
1289    mf_cpu = &__get_cpu_var(memory_failure_cpu);
1290    for (;;) {
1291        spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1292        gotten = kfifo_get(&mf_cpu->fifo, &entry);
1293        spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1294        if (!gotten)
1295            break;
1296        if (entry.flags & MF_SOFT_OFFLINE)
1297            soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1298        else
1299            memory_failure(entry.pfn, entry.trapno, entry.flags);
1300    }
1301}
1302
1303static int __init memory_failure_init(void)
1304{
1305    struct memory_failure_cpu *mf_cpu;
1306    int cpu;
1307
1308    for_each_possible_cpu(cpu) {
1309        mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1310        spin_lock_init(&mf_cpu->lock);
1311        INIT_KFIFO(mf_cpu->fifo);
1312        INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1313    }
1314
1315    return 0;
1316}
1317core_initcall(memory_failure_init);
1318
1319/**
1320 * unpoison_memory - Unpoison a previously poisoned page
1321 * @pfn: Page number of the to be unpoisoned page
1322 *
1323 * Software-unpoison a page that has been poisoned by
1324 * memory_failure() earlier.
1325 *
1326 * This is only done on the software-level, so it only works
1327 * for linux injected failures, not real hardware failures
1328 *
1329 * Returns 0 for success, otherwise -errno.
1330 */
1331int unpoison_memory(unsigned long pfn)
1332{
1333    struct page *page;
1334    struct page *p;
1335    int freeit = 0;
1336    unsigned int nr_pages;
1337
1338    if (!pfn_valid(pfn))
1339        return -ENXIO;
1340
1341    p = pfn_to_page(pfn);
1342    page = compound_head(p);
1343
1344    if (!PageHWPoison(p)) {
1345        pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1346        return 0;
1347    }
1348
1349    /*
1350     * unpoison_memory() can encounter thp only when the thp is being
1351     * worked by memory_failure() and the page lock is not held yet.
1352     * In such case, we yield to memory_failure() and make unpoison fail.
1353     */
1354    if (!PageHuge(page) && PageTransHuge(page)) {
1355        pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1356            return 0;
1357    }
1358
1359    nr_pages = 1 << compound_order(page);
1360
1361    if (!get_page_unless_zero(page)) {
1362        /*
1363         * Since HWPoisoned hugepage should have non-zero refcount,
1364         * race between memory failure and unpoison seems to happen.
1365         * In such case unpoison fails and memory failure runs
1366         * to the end.
1367         */
1368        if (PageHuge(page)) {
1369            pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1370            return 0;
1371        }
1372        if (TestClearPageHWPoison(p))
1373            atomic_long_dec(&num_poisoned_pages);
1374        pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1375        return 0;
1376    }
1377
1378    lock_page(page);
1379    /*
1380     * This test is racy because PG_hwpoison is set outside of page lock.
1381     * That's acceptable because that won't trigger kernel panic. Instead,
1382     * the PG_hwpoison page will be caught and isolated on the entrance to
1383     * the free buddy page pool.
1384     */
1385    if (TestClearPageHWPoison(page)) {
1386        pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1387        atomic_long_sub(nr_pages, &num_poisoned_pages);
1388        freeit = 1;
1389        if (PageHuge(page))
1390            clear_page_hwpoison_huge_page(page);
1391    }
1392    unlock_page(page);
1393
1394    put_page(page);
1395    if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1396        put_page(page);
1397
1398    return 0;
1399}
1400EXPORT_SYMBOL(unpoison_memory);
1401
1402static struct page *new_page(struct page *p, unsigned long private, int **x)
1403{
1404    int nid = page_to_nid(p);
1405    if (PageHuge(p))
1406        return alloc_huge_page_node(page_hstate(compound_head(p)),
1407                           nid);
1408    else
1409        return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1410}
1411
1412/*
1413 * Safely get reference count of an arbitrary page.
1414 * Returns 0 for a free page, -EIO for a zero refcount page
1415 * that is not free, and 1 for any other page type.
1416 * For 1 the page is returned with increased page count, otherwise not.
1417 */
1418static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1419{
1420    int ret;
1421
1422    if (flags & MF_COUNT_INCREASED)
1423        return 1;
1424
1425    /*
1426     * The lock_memory_hotplug prevents a race with memory hotplug.
1427     * This is a big hammer, a better would be nicer.
1428     */
1429    lock_memory_hotplug();
1430
1431    /*
1432     * Isolate the page, so that it doesn't get reallocated if it
1433     * was free. This flag should be kept set until the source page
1434     * is freed and PG_hwpoison on it is set.
1435     */
1436    if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
1437        set_migratetype_isolate(p, true);
1438    /*
1439     * When the target page is a free hugepage, just remove it
1440     * from free hugepage list.
1441     */
1442    if (!get_page_unless_zero(compound_head(p))) {
1443        if (PageHuge(p)) {
1444            pr_info("%s: %#lx free huge page\n", __func__, pfn);
1445            ret = 0;
1446        } else if (is_free_buddy_page(p)) {
1447            pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1448            ret = 0;
1449        } else {
1450            pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1451                __func__, pfn, p->flags);
1452            ret = -EIO;
1453        }
1454    } else {
1455        /* Not a free page */
1456        ret = 1;
1457    }
1458    unlock_memory_hotplug();
1459    return ret;
1460}
1461
1462static int get_any_page(struct page *page, unsigned long pfn, int flags)
1463{
1464    int ret = __get_any_page(page, pfn, flags);
1465
1466    if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1467        /*
1468         * Try to free it.
1469         */
1470        put_page(page);
1471        shake_page(page, 1);
1472
1473        /*
1474         * Did it turn free?
1475         */
1476        ret = __get_any_page(page, pfn, 0);
1477        if (!PageLRU(page)) {
1478            pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1479                pfn, page->flags);
1480            return -EIO;
1481        }
1482    }
1483    return ret;
1484}
1485
1486static int soft_offline_huge_page(struct page *page, int flags)
1487{
1488    int ret;
1489    unsigned long pfn = page_to_pfn(page);
1490    struct page *hpage = compound_head(page);
1491    LIST_HEAD(pagelist);
1492
1493    /*
1494     * This double-check of PageHWPoison is to avoid the race with
1495     * memory_failure(). See also comment in __soft_offline_page().
1496     */
1497    lock_page(hpage);
1498    if (PageHWPoison(hpage)) {
1499        unlock_page(hpage);
1500        put_page(hpage);
1501        pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1502        return -EBUSY;
1503    }
1504    unlock_page(hpage);
1505
1506    /* Keep page count to indicate a given hugepage is isolated. */
1507    list_move(&hpage->lru, &pagelist);
1508    ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1509                MIGRATE_SYNC, MR_MEMORY_FAILURE);
1510    if (ret) {
1511        pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1512            pfn, ret, page->flags);
1513        /*
1514         * We know that soft_offline_huge_page() tries to migrate
1515         * only one hugepage pointed to by hpage, so we need not
1516         * run through the pagelist here.
1517         */
1518        putback_active_hugepage(hpage);
1519        if (ret > 0)
1520            ret = -EIO;
1521    } else {
1522        set_page_hwpoison_huge_page(hpage);
1523        dequeue_hwpoisoned_huge_page(hpage);
1524        atomic_long_add(1 << compound_order(hpage),
1525                &num_poisoned_pages);
1526    }
1527    return ret;
1528}
1529
1530static int __soft_offline_page(struct page *page, int flags)
1531{
1532    int ret;
1533    unsigned long pfn = page_to_pfn(page);
1534
1535    /*
1536     * Check PageHWPoison again inside page lock because PageHWPoison
1537     * is set by memory_failure() outside page lock. Note that
1538     * memory_failure() also double-checks PageHWPoison inside page lock,
1539     * so there's no race between soft_offline_page() and memory_failure().
1540     */
1541    lock_page(page);
1542    wait_on_page_writeback(page);
1543    if (PageHWPoison(page)) {
1544        unlock_page(page);
1545        put_page(page);
1546        pr_info("soft offline: %#lx page already poisoned\n", pfn);
1547        return -EBUSY;
1548    }
1549    /*
1550     * Try to invalidate first. This should work for
1551     * non dirty unmapped page cache pages.
1552     */
1553    ret = invalidate_inode_page(page);
1554    unlock_page(page);
1555    /*
1556     * RED-PEN would be better to keep it isolated here, but we
1557     * would need to fix isolation locking first.
1558     */
1559    if (ret == 1) {
1560        put_page(page);
1561        pr_info("soft_offline: %#lx: invalidated\n", pfn);
1562        SetPageHWPoison(page);
1563        atomic_long_inc(&num_poisoned_pages);
1564        return 0;
1565    }
1566
1567    /*
1568     * Simple invalidation didn't work.
1569     * Try to migrate to a new page instead. migrate.c
1570     * handles a large number of cases for us.
1571     */
1572    ret = isolate_lru_page(page);
1573    /*
1574     * Drop page reference which is came from get_any_page()
1575     * successful isolate_lru_page() already took another one.
1576     */
1577    put_page(page);
1578    if (!ret) {
1579        LIST_HEAD(pagelist);
1580        inc_zone_page_state(page, NR_ISOLATED_ANON +
1581                    page_is_file_cache(page));
1582        list_add(&page->lru, &pagelist);
1583        ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1584                    MIGRATE_SYNC, MR_MEMORY_FAILURE);
1585        if (ret) {
1586            putback_lru_pages(&pagelist);
1587            pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1588                pfn, ret, page->flags);
1589            if (ret > 0)
1590                ret = -EIO;
1591        } else {
1592            /*
1593             * After page migration succeeds, the source page can
1594             * be trapped in pagevec and actual freeing is delayed.
1595             * Freeing code works differently based on PG_hwpoison,
1596             * so there's a race. We need to make sure that the
1597             * source page should be freed back to buddy before
1598             * setting PG_hwpoison.
1599             */
1600            if (!is_free_buddy_page(page))
1601                lru_add_drain_all();
1602            if (!is_free_buddy_page(page))
1603                drain_all_pages();
1604            SetPageHWPoison(page);
1605            if (!is_free_buddy_page(page))
1606                pr_info("soft offline: %#lx: page leaked\n",
1607                    pfn);
1608            atomic_long_inc(&num_poisoned_pages);
1609        }
1610    } else {
1611        pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1612            pfn, ret, page_count(page), page->flags);
1613    }
1614    return ret;
1615}
1616
1617/**
1618 * soft_offline_page - Soft offline a page.
1619 * @page: page to offline
1620 * @flags: flags. Same as memory_failure().
1621 *
1622 * Returns 0 on success, otherwise negated errno.
1623 *
1624 * Soft offline a page, by migration or invalidation,
1625 * without killing anything. This is for the case when
1626 * a page is not corrupted yet (so it's still valid to access),
1627 * but has had a number of corrected errors and is better taken
1628 * out.
1629 *
1630 * The actual policy on when to do that is maintained by
1631 * user space.
1632 *
1633 * This should never impact any application or cause data loss,
1634 * however it might take some time.
1635 *
1636 * This is not a 100% solution for all memory, but tries to be
1637 * ``good enough'' for the majority of memory.
1638 */
1639int soft_offline_page(struct page *page, int flags)
1640{
1641    int ret;
1642    unsigned long pfn = page_to_pfn(page);
1643    struct page *hpage = compound_trans_head(page);
1644
1645    if (PageHWPoison(page)) {
1646        pr_info("soft offline: %#lx page already poisoned\n", pfn);
1647        return -EBUSY;
1648    }
1649    if (!PageHuge(page) && PageTransHuge(hpage)) {
1650        if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1651            pr_info("soft offline: %#lx: failed to split THP\n",
1652                pfn);
1653            return -EBUSY;
1654        }
1655    }
1656
1657    ret = get_any_page(page, pfn, flags);
1658    if (ret < 0)
1659        goto unset;
1660    if (ret) { /* for in-use pages */
1661        if (PageHuge(page))
1662            ret = soft_offline_huge_page(page, flags);
1663        else
1664            ret = __soft_offline_page(page, flags);
1665    } else { /* for free pages */
1666        if (PageHuge(page)) {
1667            set_page_hwpoison_huge_page(hpage);
1668            dequeue_hwpoisoned_huge_page(hpage);
1669            atomic_long_add(1 << compound_order(hpage),
1670                    &num_poisoned_pages);
1671        } else {
1672            SetPageHWPoison(page);
1673            atomic_long_inc(&num_poisoned_pages);
1674        }
1675    }
1676unset:
1677    unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1678    return ret;
1679}
1680

Archive Download this file



interactive