mm/migrate.c

1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/swapops.h>
19#include <linux/pagemap.h>
20#include <linux/buffer_head.h>
21#include <linux/mm_inline.h>
22#include <linux/nsproxy.h>
23#include <linux/pagevec.h>
24#include <linux/ksm.h>
25#include <linux/rmap.h>
26#include <linux/topology.h>
27#include <linux/cpu.h>
28#include <linux/cpuset.h>
29#include <linux/writeback.h>
30#include <linux/mempolicy.h>
31#include <linux/vmalloc.h>
32#include <linux/security.h>
33#include <linux/memcontrol.h>
34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
36#include <linux/gfp.h>
37
38#include <asm/tlbflush.h>
39
40#include "internal.h"
41
42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
43
44/*
45 * migrate_prep() needs to be called before we start compiling a list of pages
46 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
47 * undesirable, use migrate_prep_local()
48 */
49int migrate_prep(void)
50{
51    /*
52     * Clear the LRU lists so pages can be isolated.
53     * Note that pages may be moved off the LRU after we have
54     * drained them. Those pages will fail to migrate like other
55     * pages that may be busy.
56     */
57    lru_add_drain_all();
58
59    return 0;
60}
61
62/* Do the necessary work of migrate_prep but not if it involves other CPUs */
63int migrate_prep_local(void)
64{
65    lru_add_drain();
66
67    return 0;
68}
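
/*
 * A minimal usage sketch of the expected ordering, mirroring
 * do_move_page_to_node_array() further down in this file. migrate_prep()
 * drains the per-CPU LRU pagevecs first, isolate_lru_page() returns 0 on
 * success, and the final put_page() drops the reference the caller is
 * assumed to already hold on page (e.g. from follow_page()):
 *
 *    LIST_HEAD(pagelist);
 *
 *    migrate_prep();
 *    if (!isolate_lru_page(page)) {
 *        list_add_tail(&page->lru, &pagelist);
 *        inc_zone_page_state(page, NR_ISOLATED_ANON +
 *                    page_is_file_cache(page));
 *    }
 *    put_page(page);
 */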
69
70/*
71 * Add isolated pages on the list back to the LRU under page lock
72 * to avoid leaking evictable pages back onto the unevictable list.
73 */
74void putback_lru_pages(struct list_head *l)
75{
76    struct page *page;
77    struct page *page2;
78
79    list_for_each_entry_safe(page, page2, l, lru) {
80        list_del(&page->lru);
81        dec_zone_page_state(page, NR_ISOLATED_ANON +
82                page_is_file_cache(page));
83        putback_lru_page(page);
84    }
85}
86
87/*
88 * Restore a potential migration pte to a working pte entry
89 */
90static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91                 unsigned long addr, void *old)
92{
93    struct mm_struct *mm = vma->vm_mm;
94    swp_entry_t entry;
95    pgd_t *pgd;
96    pud_t *pud;
97    pmd_t *pmd;
98    pte_t *ptep, pte;
99    spinlock_t *ptl;
100
101    if (unlikely(PageHuge(new))) {
102        ptep = huge_pte_offset(mm, addr);
103        if (!ptep)
104            goto out;
105        ptl = &mm->page_table_lock;
106    } else {
107        pgd = pgd_offset(mm, addr);
108        if (!pgd_present(*pgd))
109            goto out;
110
111        pud = pud_offset(pgd, addr);
112        if (!pud_present(*pud))
113            goto out;
114
115        pmd = pmd_offset(pud, addr);
116        if (pmd_trans_huge(*pmd))
117            goto out;
118        if (!pmd_present(*pmd))
119            goto out;
120
121        ptep = pte_offset_map(pmd, addr);
122
123        if (!is_swap_pte(*ptep)) {
124            pte_unmap(ptep);
125            goto out;
126        }
127
128        ptl = pte_lockptr(mm, pmd);
129    }
130
131    spin_lock(ptl);
132    pte = *ptep;
133    if (!is_swap_pte(pte))
134        goto unlock;
135
136    entry = pte_to_swp_entry(pte);
137
138    if (!is_migration_entry(entry) ||
139        migration_entry_to_page(entry) != old)
140        goto unlock;
141
142    get_page(new);
143    pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
144    if (is_write_migration_entry(entry))
145        pte = pte_mkwrite(pte);
146#ifdef CONFIG_HUGETLB_PAGE
147    if (PageHuge(new))
148        pte = pte_mkhuge(pte);
149#endif
150    flush_cache_page(vma, addr, pte_pfn(pte));
151    set_pte_at(mm, addr, ptep, pte);
152
153    if (PageHuge(new)) {
154        if (PageAnon(new))
155            hugepage_add_anon_rmap(new, vma, addr);
156        else
157            page_dup_rmap(new);
158    } else if (PageAnon(new))
159        page_add_anon_rmap(new, vma, addr);
160    else
161        page_add_file_rmap(new);
162
163    /* No need to invalidate - it was non-present before */
164    update_mmu_cache(vma, addr, ptep);
165unlock:
166    pte_unmap_unlock(ptep, ptl);
167out:
168    return SWAP_AGAIN;
169}
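
/*
 * For context, the entries removed above are installed by the unmap side.
 * This is a simplified sketch of what try_to_unmap_one() in mm/rmap.c does
 * for TTU_MIGRATION, not a verbatim copy of that function:
 *
 *    flush_cache_page(vma, address, page_to_pfn(page));
 *    pteval = ptep_clear_flush(vma, address, pte);
 *    entry = make_migration_entry(page, pte_write(pteval));
 *    set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 */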
170
171/*
172 * Get rid of all migration entries and replace them by
173 * references to the indicated page.
174 */
175static void remove_migration_ptes(struct page *old, struct page *new)
176{
177    rmap_walk(new, remove_migration_pte, old);
178}
179
180/*
181 * Something used the pte of a page under migration. We need to
182 * get to the page and wait until migration is finished.
183 * When we return from this function the fault will be retried.
184 *
185 * This function is called from do_swap_page().
186 */
187void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
188                unsigned long address)
189{
190    pte_t *ptep, pte;
191    spinlock_t *ptl;
192    swp_entry_t entry;
193    struct page *page;
194
195    ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
196    pte = *ptep;
197    if (!is_swap_pte(pte))
198        goto out;
199
200    entry = pte_to_swp_entry(pte);
201    if (!is_migration_entry(entry))
202        goto out;
203
204    page = migration_entry_to_page(entry);
205
206    /*
207     * Once radix-tree replacement of page migration started, page_count
208     * *must* be zero. And, we don't want to call wait_on_page_locked()
209     * against a page without get_page().
210     * So we use get_page_unless_zero() here. Even if it fails, the page
211     * fault will simply be retried.
212     */
213    if (!get_page_unless_zero(page))
214        goto out;
215    pte_unmap_unlock(ptep, ptl);
216    wait_on_page_locked(page);
217    put_page(page);
218    return;
219out:
220    pte_unmap_unlock(ptep, ptl);
221}
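
/*
 * A simplified sketch of the do_swap_page() side (mm/memory.c) that hands
 * off to migration_entry_wait() when a fault hits a migration entry; the
 * surrounding error handling is omitted here:
 *
 *    entry = pte_to_swp_entry(orig_pte);
 *    if (unlikely(non_swap_entry(entry))) {
 *        if (is_migration_entry(entry)) {
 *            migration_entry_wait(mm, pmd, address);
 *            goto out;
 *        }
 *    }
 */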
222
223/*
224 * Replace the page in the mapping.
225 *
226 * The number of remaining references must be:
227 * 1 for anonymous pages without a mapping
228 * 2 for pages with a mapping
229 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
230 */
231static int migrate_page_move_mapping(struct address_space *mapping,
232        struct page *newpage, struct page *page)
233{
234    int expected_count;
235    void **pslot;
236
237    if (!mapping) {
238        /* Anonymous page without mapping */
239        if (page_count(page) != 1)
240            return -EAGAIN;
241        return 0;
242    }
243
244    spin_lock_irq(&mapping->tree_lock);
245
246    pslot = radix_tree_lookup_slot(&mapping->page_tree,
247                     page_index(page));
248
249    expected_count = 2 + page_has_private(page);
250    if (page_count(page) != expected_count ||
251        radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
252        spin_unlock_irq(&mapping->tree_lock);
253        return -EAGAIN;
254    }
255
256    if (!page_freeze_refs(page, expected_count)) {
257        spin_unlock_irq(&mapping->tree_lock);
258        return -EAGAIN;
259    }
260
261    /*
262     * Now we know that no one else is looking at the page.
263     */
264    get_page(newpage); /* add cache reference */
265    if (PageSwapCache(page)) {
266        SetPageSwapCache(newpage);
267        set_page_private(newpage, page_private(page));
268    }
269
270    radix_tree_replace_slot(pslot, newpage);
271
272    page_unfreeze_refs(page, expected_count);
273    /*
274     * Drop cache reference from old page.
275     * We know this isn't the last reference.
276     */
277    __put_page(page);
278
279    /*
280     * If moved to a different zone then also account
281     * the page for that zone. Other VM counters will be
282     * taken care of when we establish references to the
283     * new page and drop references to the old page.
284     *
285     * Note that anonymous pages are accounted for
286     * via NR_FILE_PAGES and NR_ANON_PAGES if they
287     * are mapped to swap space.
288     */
289    __dec_zone_page_state(page, NR_FILE_PAGES);
290    __inc_zone_page_state(newpage, NR_FILE_PAGES);
291    if (!PageSwapCache(page) && PageSwapBacked(page)) {
292        __dec_zone_page_state(page, NR_SHMEM);
293        __inc_zone_page_state(newpage, NR_SHMEM);
294    }
295    spin_unlock_irq(&mapping->tree_lock);
296
297    return 0;
298}
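
/*
 * A worked example of the arithmetic above, assuming a file-backed page
 * with buffer heads that has been isolated from the LRU: one reference is
 * held by the isolating caller, one by the page cache radix tree slot and
 * one by the attached buffers, so expected_count is 2 + page_has_private()
 * == 3, and page_freeze_refs() succeeds only if nobody else holds a
 * transient reference at that moment.
 */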
299
300/*
301 * The expected number of remaining references is the same as that
302 * of migrate_page_move_mapping().
303 */
304int migrate_huge_page_move_mapping(struct address_space *mapping,
305                   struct page *newpage, struct page *page)
306{
307    int expected_count;
308    void **pslot;
309
310    if (!mapping) {
311        if (page_count(page) != 1)
312            return -EAGAIN;
313        return 0;
314    }
315
316    spin_lock_irq(&mapping->tree_lock);
317
318    pslot = radix_tree_lookup_slot(&mapping->page_tree,
319                    page_index(page));
320
321    expected_count = 2 + page_has_private(page);
322    if (page_count(page) != expected_count ||
323        radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
324        spin_unlock_irq(&mapping->tree_lock);
325        return -EAGAIN;
326    }
327
328    if (!page_freeze_refs(page, expected_count)) {
329        spin_unlock_irq(&mapping->tree_lock);
330        return -EAGAIN;
331    }
332
333    get_page(newpage);
334
335    radix_tree_replace_slot(pslot, newpage);
336
337    page_unfreeze_refs(page, expected_count);
338
339    __put_page(page);
340
341    spin_unlock_irq(&mapping->tree_lock);
342    return 0;
343}
344
345/*
346 * Copy the page to its new location
347 */
348void migrate_page_copy(struct page *newpage, struct page *page)
349{
350    if (PageHuge(page))
351        copy_huge_page(newpage, page);
352    else
353        copy_highpage(newpage, page);
354
355    if (PageError(page))
356        SetPageError(newpage);
357    if (PageReferenced(page))
358        SetPageReferenced(newpage);
359    if (PageUptodate(page))
360        SetPageUptodate(newpage);
361    if (TestClearPageActive(page)) {
362        VM_BUG_ON(PageUnevictable(page));
363        SetPageActive(newpage);
364    } else if (TestClearPageUnevictable(page))
365        SetPageUnevictable(newpage);
366    if (PageChecked(page))
367        SetPageChecked(newpage);
368    if (PageMappedToDisk(page))
369        SetPageMappedToDisk(newpage);
370
371    if (PageDirty(page)) {
372        clear_page_dirty_for_io(page);
373        /*
374         * Want to mark the page and the radix tree as dirty, and
375         * redo the accounting that clear_page_dirty_for_io undid,
376         * but we can't use set_page_dirty because that function
377         * is actually a signal that all of the page has become dirty,
378         * whereas only part of our page may be dirty.
379         */
380        __set_page_dirty_nobuffers(newpage);
381     }
382
383    mlock_migrate_page(newpage, page);
384    ksm_migrate_page(newpage, page);
385
386    ClearPageSwapCache(page);
387    ClearPagePrivate(page);
388    set_page_private(page, 0);
389    page->mapping = NULL;
390
391    /*
392     * If any waiters have accumulated on the new page then
393     * wake them up.
394     */
395    if (PageWriteback(newpage))
396        end_page_writeback(newpage);
397}
398
399/************************************************************
400 * Migration functions
401 ***********************************************************/
402
403/* Always fail migration. Used for mappings that are not movable */
404int fail_migrate_page(struct address_space *mapping,
405            struct page *newpage, struct page *page)
406{
407    return -EIO;
408}
409EXPORT_SYMBOL(fail_migrate_page);
410
411/*
412 * Common logic to directly migrate a single page suitable for
413 * pages that do not use PagePrivate/PagePrivate2.
414 *
415 * Pages are locked upon entry and exit.
416 */
417int migrate_page(struct address_space *mapping,
418        struct page *newpage, struct page *page)
419{
420    int rc;
421
422    BUG_ON(PageWriteback(page)); /* Writeback must be complete */
423
424    rc = migrate_page_move_mapping(mapping, newpage, page);
425
426    if (rc)
427        return rc;
428
429    migrate_page_copy(newpage, page);
430    return 0;
431}
432EXPORT_SYMBOL(migrate_page);
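
/*
 * Mappings opt in to this nonblocking path by pointing the migratepage
 * method of their address_space_operations at migrate_page(); swapcache
 * and tmpfs do exactly that. An illustrative sketch only, where
 * example_aops and example_writepage are made-up names:
 *
 *    static const struct address_space_operations example_aops = {
 *        .writepage   = example_writepage,
 *        .migratepage = migrate_page,
 *    };
 */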
433
434#ifdef CONFIG_BLOCK
435/*
436 * Migration function for pages with buffers. This function can only be used
437 * if the underlying filesystem guarantees that no other references to "page"
438 * exist.
439 */
440int buffer_migrate_page(struct address_space *mapping,
441        struct page *newpage, struct page *page)
442{
443    struct buffer_head *bh, *head;
444    int rc;
445
446    if (!page_has_buffers(page))
447        return migrate_page(mapping, newpage, page);
448
449    head = page_buffers(page);
450
451    rc = migrate_page_move_mapping(mapping, newpage, page);
452
453    if (rc)
454        return rc;
455
456    bh = head;
457    do {
458        get_bh(bh);
459        lock_buffer(bh);
460        bh = bh->b_this_page;
461
462    } while (bh != head);
463
464    ClearPagePrivate(page);
465    set_page_private(newpage, page_private(page));
466    set_page_private(page, 0);
467    put_page(page);
468    get_page(newpage);
469
470    bh = head;
471    do {
472        set_bh_page(bh, newpage, bh_offset(bh));
473        bh = bh->b_this_page;
474
475    } while (bh != head);
476
477    SetPagePrivate(newpage);
478
479    migrate_page_copy(newpage, page);
480
481    bh = head;
482    do {
483        unlock_buffer(bh);
484         put_bh(bh);
485        bh = bh->b_this_page;
486
487    } while (bh != head);
488
489    return 0;
490}
491EXPORT_SYMBOL(buffer_migrate_page);
492#endif
493
494/*
495 * Writeback a page to clean the dirty state
496 */
497static int writeout(struct address_space *mapping, struct page *page)
498{
499    struct writeback_control wbc = {
500        .sync_mode = WB_SYNC_NONE,
501        .nr_to_write = 1,
502        .range_start = 0,
503        .range_end = LLONG_MAX,
504        .for_reclaim = 1
505    };
506    int rc;
507
508    if (!mapping->a_ops->writepage)
509        /* No write method for the address space */
510        return -EINVAL;
511
512    if (!clear_page_dirty_for_io(page))
513        /* Someone else already triggered a write */
514        return -EAGAIN;
515
516    /*
517     * A dirty page may imply that the underlying filesystem has
518     * the page on some queue. So the page must be clean for
519     * migration. Writeout may mean we lose the lock and the
520     * page state is no longer what we checked for earlier.
521     * At this point we know that the migration attempt cannot
522     * be successful.
523     */
524    remove_migration_ptes(page, page);
525
526    rc = mapping->a_ops->writepage(page, &wbc);
527
528    if (rc != AOP_WRITEPAGE_ACTIVATE)
529        /* unlocked. Relock */
530        lock_page(page);
531
532    return (rc < 0) ? -EIO : -EAGAIN;
533}
534
535/*
536 * Default handling if a filesystem does not provide a migration function.
537 */
538static int fallback_migrate_page(struct address_space *mapping,
539    struct page *newpage, struct page *page)
540{
541    if (PageDirty(page))
542        return writeout(mapping, page);
543
544    /*
545     * Buffers may be managed in a filesystem specific way.
546     * We must have no buffers or drop them.
547     */
548    if (page_has_private(page) &&
549        !try_to_release_page(page, GFP_KERNEL))
550        return -EAGAIN;
551
552    return migrate_page(mapping, newpage, page);
553}
554
555/*
556 * Move a page to a newly allocated page.
557 * The page is locked and all ptes have been successfully removed.
558 *
559 * The new page will have replaced the old page if this function
560 * is successful.
561 *
562 * Return value:
563 * < 0 - error code
564 * == 0 - success
565 */
566static int move_to_new_page(struct page *newpage, struct page *page,
567                    int remap_swapcache, bool sync)
568{
569    struct address_space *mapping;
570    int rc;
571
572    /*
573     * Block others from accessing the page when we get around to
574     * establishing additional references. We are the only one
575     * holding a reference to the new page at this point.
576     */
577    if (!trylock_page(newpage))
578        BUG();
579
580    /* Prepare mapping for the new page.*/
581    newpage->index = page->index;
582    newpage->mapping = page->mapping;
583    if (PageSwapBacked(page))
584        SetPageSwapBacked(newpage);
585
586    mapping = page_mapping(page);
587    if (!mapping)
588        rc = migrate_page(mapping, newpage, page);
589    else {
590        /*
591         * Do not writeback pages if !sync and migratepage is
592         * not pointing to migrate_page() which is nonblocking
593         * (swapcache/tmpfs uses migratepage = migrate_page).
594         */
595        if (PageDirty(page) && !sync &&
596            mapping->a_ops->migratepage != migrate_page)
597            rc = -EBUSY;
598        else if (mapping->a_ops->migratepage)
599            /*
600             * Most pages have a mapping and most filesystems
601             * should provide a migration function. Anonymous
602             * pages are part of swap space which also has its
603             * own migration function. This is the most common
604             * path for page migration.
605             */
606            rc = mapping->a_ops->migratepage(mapping,
607                            newpage, page);
608        else
609            rc = fallback_migrate_page(mapping, newpage, page);
610    }
611
612    if (rc) {
613        newpage->mapping = NULL;
614    } else {
615        if (remap_swapcache)
616            remove_migration_ptes(page, newpage);
617    }
618
619    unlock_page(newpage);
620
621    return rc;
622}
623
624/*
625 * Obtain the lock on page, remove all ptes and migrate the page
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629            struct page *page, int force, bool offlining, bool sync)
630{
631    int rc = 0;
632    int *result = NULL;
633    struct page *newpage = get_new_page(page, private, &result);
634    int remap_swapcache = 1;
635    int charge = 0;
636    struct mem_cgroup *mem;
637    struct anon_vma *anon_vma = NULL;
638
639    if (!newpage)
640        return -ENOMEM;
641
642    if (page_count(page) == 1) {
643        /* page was freed from under us. So we are done. */
644        goto move_newpage;
645    }
646    if (unlikely(PageTransHuge(page)))
647        if (unlikely(split_huge_page(page)))
648            goto move_newpage;
649
650    /* prepare cgroup just returns 0 or -ENOMEM */
651    rc = -EAGAIN;
652
653    if (!trylock_page(page)) {
654        if (!force || !sync)
655            goto move_newpage;
656
657        /*
658         * It's not safe for direct compaction to call lock_page.
659         * For example, during page readahead pages are added locked
660         * to the LRU. Later, when the IO completes the pages are
661         * marked uptodate and unlocked. However, the queueing
662         * could be merging multiple pages for one bio (e.g.
663         * mpage_readpages). If an allocation happens for the
664         * second or third page, the process can end up locking
665         * the same page twice and deadlocking. Rather than
666         * trying to be clever about what pages can be locked,
667         * avoid the use of lock_page for direct compaction
668         * altogether.
669         */
670        if (current->flags & PF_MEMALLOC)
671            goto move_newpage;
672
673        lock_page(page);
674    }
675
676    /*
677     * Only memory hotplug's offline_pages() caller has locked out KSM,
678     * and can safely migrate a KSM page. The other cases have skipped
679     * PageKsm along with PageReserved - but it is only now when we have
680     * the page lock that we can be certain it will not go KSM beneath us
681     * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
682     * its pagecount raised, but only here do we take the page lock which
683     * serializes that).
684     */
685    if (PageKsm(page) && !offlining) {
686        rc = -EBUSY;
687        goto unlock;
688    }
689
690    /* charge against new page */
691    charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
692    if (charge == -ENOMEM) {
693        rc = -ENOMEM;
694        goto unlock;
695    }
696    BUG_ON(charge);
697
698    if (PageWriteback(page)) {
699        /*
700         * For !sync, there is no point retrying as the retry loop
701         * is expected to be too short for PageWriteback to be cleared
702         */
703        if (!sync) {
704            rc = -EBUSY;
705            goto uncharge;
706        }
707        if (!force)
708            goto uncharge;
709        wait_on_page_writeback(page);
710    }
711    /*
712     * By the time try_to_unmap() returns, page->mapcount has dropped to 0,
713     * so we would otherwise have no way to notice the anon_vma being freed
714     * while we migrate the page. Taking a reference via page_get_anon_vma()
715     * delays freeing the anon_vma until the end of migration. File cache
716     * pages are no problem because they are serialized by the page lock,
717     * so only anonymous pages need this care.
718     */
719    if (PageAnon(page)) {
720        /*
721         * Only page_lock_anon_vma() understands the subtleties of
722         * getting a hold on an anon_vma from outside one of its mms.
723         */
724        anon_vma = page_get_anon_vma(page);
725        if (anon_vma) {
726            /*
727             * Anon page
728             */
729        } else if (PageSwapCache(page)) {
730            /*
731             * We cannot be sure that the anon_vma of an unmapped
732             * swapcache page is safe to use because we don't
733             * know in advance if the VMA that this page belonged
734             * to still exists. If the VMA and others sharing the
735             * data have been freed, then the anon_vma could
736             * already be invalid.
737             *
738             * To avoid this possibility, swapcache pages get
739             * migrated but are not remapped when migration
740             * completes
741             */
742            remap_swapcache = 0;
743        } else {
744            goto uncharge;
745        }
746    }
747
748    /*
749     * Corner case handling:
750     * 1. When a new swap-cache page is being read in, it is added to the LRU
751     * and treated as swapcache but it has no rmap yet.
752     * Calling try_to_unmap() against a page->mapping==NULL page will
753     * trigger a BUG. So handle it here.
754     * 2. An orphaned page (see truncate_complete_page) might have
755     * fs-private metadata. The page can be picked up due to memory
756     * offlining. Everywhere else except page reclaim, the page is
757     * invisible to the vm, so the page can not be migrated. So try to
758     * free the metadata, so the page can be freed.
759     */
760    if (!page->mapping) {
761        VM_BUG_ON(PageAnon(page));
762        if (page_has_private(page)) {
763            try_to_free_buffers(page);
764            goto uncharge;
765        }
766        goto skip_unmap;
767    }
768
769    /* Establish migration ptes or remove ptes */
770    try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
771
772skip_unmap:
773    if (!page_mapped(page))
774        rc = move_to_new_page(newpage, page, remap_swapcache, sync);
775
776    if (rc && remap_swapcache)
777        remove_migration_ptes(page, page);
778
779    /* Drop an anon_vma reference if we took one */
780    if (anon_vma)
781        put_anon_vma(anon_vma);
782
783uncharge:
784    if (!charge)
785        mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock:
787    unlock_page(page);
788
789move_newpage:
790    if (rc != -EAGAIN) {
791         /*
792          * A page that has been migrated has all references
793          * removed and will be freed. A page that has not been
794          * migrated will have kept its references and be
795          * restored.
796          */
797         list_del(&page->lru);
798        dec_zone_page_state(page, NR_ISOLATED_ANON +
799                page_is_file_cache(page));
800        putback_lru_page(page);
801    }
802
803    /*
804     * Move the new page to the LRU. If migration was not successful
805     * then this will free the page.
806     */
807    putback_lru_page(newpage);
808
809    if (result) {
810        if (rc)
811            *result = rc;
812        else
813            *result = page_to_nid(newpage);
814    }
815    return rc;
816}
817
818/*
819 * Counterpart of unmap_and_move() for hugepage migration.
820 *
821 * This function doesn't wait for the completion of hugepage I/O
822 * because there is no race between I/O and migration for hugepages.
823 * Note that currently hugepage I/O occurs only in direct I/O
824 * where no lock is held and PG_writeback is irrelevant,
825 * and the writeback status of all subpages is counted in the reference
826 * count of the head page (i.e. if all subpages of a 2MB hugepage are
827 * under direct I/O, the reference count of the head page is 512 and a bit more.)
828 * This means that when we try to migrate a hugepage whose subpages are
829 * doing direct I/O, some references remain after try_to_unmap() and
830 * hugepage migration fails without data corruption.
831 *
832 * There is also no race when direct I/O is issued on a page under migration,
833 * because then the pte is replaced with a migration swap entry and the direct
834 * I/O code will wait in the page fault for migration to complete.
835 */
836static int unmap_and_move_huge_page(new_page_t get_new_page,
837                unsigned long private, struct page *hpage,
838                int force, bool offlining, bool sync)
839{
840    int rc = 0;
841    int *result = NULL;
842    struct page *new_hpage = get_new_page(hpage, private, &result);
843    struct anon_vma *anon_vma = NULL;
844
845    if (!new_hpage)
846        return -ENOMEM;
847
848    rc = -EAGAIN;
849
850    if (!trylock_page(hpage)) {
851        if (!force || !sync)
852            goto out;
853        lock_page(hpage);
854    }
855
856    if (PageAnon(hpage))
857        anon_vma = page_get_anon_vma(hpage);
858
859    try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
860
861    if (!page_mapped(hpage))
862        rc = move_to_new_page(new_hpage, hpage, 1, sync);
863
864    if (rc)
865        remove_migration_ptes(hpage, hpage);
866
867    if (anon_vma)
868        put_anon_vma(anon_vma);
869out:
870    unlock_page(hpage);
871
872    if (rc != -EAGAIN) {
873        list_del(&hpage->lru);
874        put_page(hpage);
875    }
876
877    put_page(new_hpage);
878
879    if (result) {
880        if (rc)
881            *result = rc;
882        else
883            *result = page_to_nid(new_hpage);
884    }
885    return rc;
886}
887
888/*
889 * migrate_pages
890 *
891 * The function takes one list of pages to migrate and a callback
892 * that determines, from the page to be migrated and the private data,
893 * the target of the move, and allocates a new page for it.
894 *
895 * The function returns after 10 attempts or when no pages
896 * are movable any more because the list has become empty
897 * or no retryable pages exist any more.
898 * The caller should call putback_lru_pages() to return the pages to the
899 * LRU or free list only if ret != 0.
900 *
901 * Return: number of pages not migrated, or an error code.
902 */
903int migrate_pages(struct list_head *from,
904        new_page_t get_new_page, unsigned long private, bool offlining,
905        bool sync)
906{
907    int retry = 1;
908    int nr_failed = 0;
909    int pass = 0;
910    struct page *page;
911    struct page *page2;
912    int swapwrite = current->flags & PF_SWAPWRITE;
913    int rc;
914
915    if (!swapwrite)
916        current->flags |= PF_SWAPWRITE;
917
918    for(pass = 0; pass < 10 && retry; pass++) {
919        retry = 0;
920
921        list_for_each_entry_safe(page, page2, from, lru) {
922            cond_resched();
923
924            rc = unmap_and_move(get_new_page, private,
925                        page, pass > 2, offlining,
926                        sync);
927
928            switch(rc) {
929            case -ENOMEM:
930                goto out;
931            case -EAGAIN:
932                retry++;
933                break;
934            case 0:
935                break;
936            default:
937                /* Permanent failure */
938                nr_failed++;
939                break;
940            }
941        }
942    }
943    rc = 0;
944out:
945    if (!swapwrite)
946        current->flags &= ~PF_SWAPWRITE;
947
948    if (rc)
949        return rc;
950
951    return nr_failed + retry;
952}
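
/*
 * A minimal caller sketch following the contract above (compare
 * do_move_page_to_node_array() below). alloc_target() is a made-up
 * new_page_t callback that allocates on the node passed in private:
 *
 *    static struct page *alloc_target(struct page *p, unsigned long private,
 *                    int **result)
 *    {
 *        return alloc_pages_exact_node((int)private,
 *                    GFP_HIGHUSER_MOVABLE, 0);
 *    }
 *
 *    ...
 *        ret = migrate_pages(&pagelist, alloc_target, target_nid,
 *                    false, true);
 *        if (ret)
 *            putback_lru_pages(&pagelist);
 */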
953
954int migrate_huge_pages(struct list_head *from,
955        new_page_t get_new_page, unsigned long private, bool offlining,
956        bool sync)
957{
958    int retry = 1;
959    int nr_failed = 0;
960    int pass = 0;
961    struct page *page;
962    struct page *page2;
963    int rc;
964
965    for (pass = 0; pass < 10 && retry; pass++) {
966        retry = 0;
967
968        list_for_each_entry_safe(page, page2, from, lru) {
969            cond_resched();
970
971            rc = unmap_and_move_huge_page(get_new_page,
972                    private, page, pass > 2, offlining,
973                    sync);
974
975            switch(rc) {
976            case -ENOMEM:
977                goto out;
978            case -EAGAIN:
979                retry++;
980                break;
981            case 0:
982                break;
983            default:
984                /* Permanent failure */
985                nr_failed++;
986                break;
987            }
988        }
989    }
990    rc = 0;
991out:
992    if (rc)
993        return rc;
994
995    return nr_failed + retry;
996}
997
998#ifdef CONFIG_NUMA
999/*
1000 * Move a list of individual pages
1001 */
1002struct page_to_node {
1003    unsigned long addr;
1004    struct page *page;
1005    int node;
1006    int status;
1007};
1008
1009static struct page *new_page_node(struct page *p, unsigned long private,
1010        int **result)
1011{
1012    struct page_to_node *pm = (struct page_to_node *)private;
1013
1014    while (pm->node != MAX_NUMNODES && pm->page != p)
1015        pm++;
1016
1017    if (pm->node == MAX_NUMNODES)
1018        return NULL;
1019
1020    *result = &pm->status;
1021
1022    return alloc_pages_exact_node(pm->node,
1023                GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1024}
1025
1026/*
1027 * Move a set of pages as indicated in the pm array. The addr
1028 * field must be set to the virtual address of the page to be moved
1029 * and the node number must contain a valid target node.
1030 * The pm array ends with node = MAX_NUMNODES.
1031 */
1032static int do_move_page_to_node_array(struct mm_struct *mm,
1033                      struct page_to_node *pm,
1034                      int migrate_all)
1035{
1036    int err;
1037    struct page_to_node *pp;
1038    LIST_HEAD(pagelist);
1039
1040    down_read(&mm->mmap_sem);
1041
1042    /*
1043     * Build a list of pages to migrate
1044     */
1045    for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
1046        struct vm_area_struct *vma;
1047        struct page *page;
1048
1049        err = -EFAULT;
1050        vma = find_vma(mm, pp->addr);
1051        if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1052            goto set_status;
1053
1054        page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1055
1056        err = PTR_ERR(page);
1057        if (IS_ERR(page))
1058            goto set_status;
1059
1060        err = -ENOENT;
1061        if (!page)
1062            goto set_status;
1063
1064        /* Use PageReserved to check for zero page */
1065        if (PageReserved(page) || PageKsm(page))
1066            goto put_and_set;
1067
1068        pp->page = page;
1069        err = page_to_nid(page);
1070
1071        if (err == pp->node)
1072            /*
1073             * Node already in the right place
1074             */
1075            goto put_and_set;
1076
1077        err = -EACCES;
1078        if (page_mapcount(page) > 1 &&
1079                !migrate_all)
1080            goto put_and_set;
1081
1082        err = isolate_lru_page(page);
1083        if (!err) {
1084            list_add_tail(&page->lru, &pagelist);
1085            inc_zone_page_state(page, NR_ISOLATED_ANON +
1086                        page_is_file_cache(page));
1087        }
1088put_and_set:
1089        /*
1090         * Either remove the duplicate refcount from
1091         * isolate_lru_page() or drop the page ref if it was
1092         * not isolated.
1093         */
1094        put_page(page);
1095set_status:
1096        pp->status = err;
1097    }
1098
1099    err = 0;
1100    if (!list_empty(&pagelist)) {
1101        err = migrate_pages(&pagelist, new_page_node,
1102                (unsigned long)pm, 0, true);
1103        if (err)
1104            putback_lru_pages(&pagelist);
1105    }
1106
1107    up_read(&mm->mmap_sem);
1108    return err;
1109}
1110
1111/*
1112 * Migrate an array of page addresses onto an array of nodes and fill
1113 * in the corresponding array of status values.
1114 */
1115static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
1116             unsigned long nr_pages,
1117             const void __user * __user *pages,
1118             const int __user *nodes,
1119             int __user *status, int flags)
1120{
1121    struct page_to_node *pm;
1122    nodemask_t task_nodes;
1123    unsigned long chunk_nr_pages;
1124    unsigned long chunk_start;
1125    int err;
1126
1127    task_nodes = cpuset_mems_allowed(task);
1128
1129    err = -ENOMEM;
1130    pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1131    if (!pm)
1132        goto out;
1133
1134    migrate_prep();
1135
1136    /*
1137     * Store a chunk of page_to_node array in a page,
1138     * but keep the last one as a marker
1139     */
1140    chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
1141
1142    for (chunk_start = 0;
1143         chunk_start < nr_pages;
1144         chunk_start += chunk_nr_pages) {
1145        int j;
1146
1147        if (chunk_start + chunk_nr_pages > nr_pages)
1148            chunk_nr_pages = nr_pages - chunk_start;
1149
1150        /* fill the chunk pm with addrs and nodes from user-space */
1151        for (j = 0; j < chunk_nr_pages; j++) {
1152            const void __user *p;
1153            int node;
1154
1155            err = -EFAULT;
1156            if (get_user(p, pages + j + chunk_start))
1157                goto out_pm;
1158            pm[j].addr = (unsigned long) p;
1159
1160            if (get_user(node, nodes + j + chunk_start))
1161                goto out_pm;
1162
1163            err = -ENODEV;
1164            if (node < 0 || node >= MAX_NUMNODES)
1165                goto out_pm;
1166
1167            if (!node_state(node, N_HIGH_MEMORY))
1168                goto out_pm;
1169
1170            err = -EACCES;
1171            if (!node_isset(node, task_nodes))
1172                goto out_pm;
1173
1174            pm[j].node = node;
1175        }
1176
1177        /* End marker for this chunk */
1178        pm[chunk_nr_pages].node = MAX_NUMNODES;
1179
1180        /* Migrate this chunk */
1181        err = do_move_page_to_node_array(mm, pm,
1182                         flags & MPOL_MF_MOVE_ALL);
1183        if (err < 0)
1184            goto out_pm;
1185
1186        /* Return status information */
1187        for (j = 0; j < chunk_nr_pages; j++)
1188            if (put_user(pm[j].status, status + j + chunk_start)) {
1189                err = -EFAULT;
1190                goto out_pm;
1191            }
1192    }
1193    err = 0;
1194
1195out_pm:
1196    free_page((unsigned long)pm);
1197out:
1198    return err;
1199}
1200
1201/*
1202 * Determine the nodes of an array of pages and store them in an array of status values.
1203 */
1204static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1205                const void __user **pages, int *status)
1206{
1207    unsigned long i;
1208
1209    down_read(&mm->mmap_sem);
1210
1211    for (i = 0; i < nr_pages; i++) {
1212        unsigned long addr = (unsigned long)(*pages);
1213        struct vm_area_struct *vma;
1214        struct page *page;
1215        int err = -EFAULT;
1216
1217        vma = find_vma(mm, addr);
1218        if (!vma || addr < vma->vm_start)
1219            goto set_status;
1220
1221        page = follow_page(vma, addr, 0);
1222
1223        err = PTR_ERR(page);
1224        if (IS_ERR(page))
1225            goto set_status;
1226
1227        err = -ENOENT;
1228        /* Use PageReserved to check for zero page */
1229        if (!page || PageReserved(page) || PageKsm(page))
1230            goto set_status;
1231
1232        err = page_to_nid(page);
1233set_status:
1234        *status = err;
1235
1236        pages++;
1237        status++;
1238    }
1239
1240    up_read(&mm->mmap_sem);
1241}
1242
1243/*
1244 * Determine the nodes of a user array of pages and store them in
1245 * a user array of status values.
1246 */
1247static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
1248             const void __user * __user *pages,
1249             int __user *status)
1250{
1251#define DO_PAGES_STAT_CHUNK_NR 16
1252    const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
1253    int chunk_status[DO_PAGES_STAT_CHUNK_NR];
1254
1255    while (nr_pages) {
1256        unsigned long chunk_nr;
1257
1258        chunk_nr = nr_pages;
1259        if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
1260            chunk_nr = DO_PAGES_STAT_CHUNK_NR;
1261
1262        if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
1263            break;
1264
1265        do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
1266
1267        if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
1268            break;
1269
1270        pages += chunk_nr;
1271        status += chunk_nr;
1272        nr_pages -= chunk_nr;
1273    }
1274    return nr_pages ? -EFAULT : 0;
1275}
1276
1277/*
1278 * Move a list of pages in the address space of the target process
1279 * (identified by pid, or of the current process when pid is 0).
1280 */
1281SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1282        const void __user * __user *, pages,
1283        const int __user *, nodes,
1284        int __user *, status, int, flags)
1285{
1286    const struct cred *cred = current_cred(), *tcred;
1287    struct task_struct *task;
1288    struct mm_struct *mm;
1289    int err;
1290
1291    /* Check flags */
1292    if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
1293        return -EINVAL;
1294
1295    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1296        return -EPERM;
1297
1298    /* Find the mm_struct */
1299    rcu_read_lock();
1300    task = pid ? find_task_by_vpid(pid) : current;
1301    if (!task) {
1302        rcu_read_unlock();
1303        return -ESRCH;
1304    }
1305    mm = get_task_mm(task);
1306    rcu_read_unlock();
1307
1308    if (!mm)
1309        return -EINVAL;
1310
1311    /*
1312     * Check if this process has the right to modify the specified
1313     * process. The right exists if the process has administrative
1314     * capabilities, superuser privileges or the same
1315     * userid as the target process.
1316     */
1317    rcu_read_lock();
1318    tcred = __task_cred(task);
1319    if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1320        cred->uid != tcred->suid && cred->uid != tcred->uid &&
1321        !capable(CAP_SYS_NICE)) {
1322        rcu_read_unlock();
1323        err = -EPERM;
1324        goto out;
1325    }
1326    rcu_read_unlock();
1327
1328    err = security_task_movememory(task);
1329    if (err)
1330        goto out;
1331
1332    if (nodes) {
1333        err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1334                    flags);
1335    } else {
1336        err = do_pages_stat(mm, nr_pages, pages, status);
1337    }
1338
1339out:
1340    mmput(mm);
1341    return err;
1342}
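
/*
 * A hedged userspace sketch of driving this syscall through the libnuma
 * wrapper declared in <numaif.h> (link with -lnuma); pid 0 targets the
 * calling process, and buf is assumed to be a page-aligned address in it:
 *
 *    #include <stdio.h>
 *    #include <numaif.h>
 *
 *    void *pages[1] = { buf };
 *    int nodes[1] = { 1 };
 *    int status[1];
 *
 *    if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
 *        printf("page now on node %d\n", status[0]);
 */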
1343
1344/*
1345 * Call the migration functions in vm_ops that may prepare
1346 * memory in a vma for migration. Migration functions may perform
1347 * the migration for vmas that do not have an underlying page struct.
1348 */
1349int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1350    const nodemask_t *from, unsigned long flags)
1351{
1352    struct vm_area_struct *vma;
1353    int err = 0;
1354
1355    for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
1356        if (vma->vm_ops && vma->vm_ops->migrate) {
1357            err = vma->vm_ops->migrate(vma, to, from, flags);
1358            if (err)
1359                break;
1360        }
1361    }
1362    return err;
1363}
1364#endif
1365
