
/*
 * linux/mm/memory_hotplug.c
 *
 * Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>

#include <asm/tlbflush.h>

#include "internal.h"

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
    struct resource *res;
    res = kzalloc(sizeof(struct resource), GFP_KERNEL);
    BUG_ON(!res);

    res->name = "System RAM";
    res->start = start;
    res->end = start + size - 1;
    res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
    if (request_resource(&iomem_resource, res) < 0) {
        printk("System RAM resource %llx - %llx cannot be added\n",
        (unsigned long long)res->start, (unsigned long long)res->end);
        kfree(res);
        res = NULL;
    }
    return res;
}

static void release_memory_resource(struct resource *res)
{
    if (!res)
        return;
    release_resource(res);
    kfree(res);
    return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
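/*
 * With SPARSEMEM (without vmemmap) the memmap, usemap and pgdat of boot-time
 * sections may have been allocated from bootmem rather than from the buddy
 * allocator. Such pages are registered here so they can still be freed on
 * hot-remove: the (negative) type is stashed in page->_mapcount, the owning
 * section/node number in page->private, and one extra reference is taken on
 * page->_count per registration.
 */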
static void get_page_bootmem(unsigned long info, struct page *page, int type)
{
    atomic_set(&page->_mapcount, type);
    SetPagePrivate(page);
    set_page_private(page, info);
    atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
    int type;

    type = atomic_read(&page->_mapcount);
    BUG_ON(type >= -1);

    if (atomic_dec_return(&page->_count) == 1) {
        ClearPagePrivate(page);
        set_page_private(page, 0);
        reset_page_mapcount(page);
        __free_pages_bootmem(page, 0);
    }

}

static void register_page_bootmem_info_section(unsigned long start_pfn)
{
    unsigned long *usemap, mapsize, section_nr, i;
    struct mem_section *ms;
    struct page *page, *memmap;

    if (!pfn_valid(start_pfn))
        return;

    section_nr = pfn_to_section_nr(start_pfn);
    ms = __nr_to_section(section_nr);

    /* Get section's memmap address */
    memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

    /*
     * Get page for the memmap's phys address
     * XXX: need more consideration for sparse_vmemmap...
     */
    page = virt_to_page(memmap);
    mapsize = sizeof(struct page) * PAGES_PER_SECTION;
    mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

    /* remember memmap's page */
    for (i = 0; i < mapsize; i++, page++)
        get_page_bootmem(section_nr, page, SECTION_INFO);

    usemap = __nr_to_section(section_nr)->pageblock_flags;
    page = virt_to_page(usemap);

    mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

    for (i = 0; i < mapsize; i++, page++)
        get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}

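/*
 * Register every bootmem page that describes this node so it can be freed on
 * hot-remove: the pgdat itself, each zone's wait table, and the memmap and
 * usemap of every valid section spanned by the node.
 */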
void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
    unsigned long i, pfn, end_pfn, nr_pages;
    int node = pgdat->node_id;
    struct page *page;
    struct zone *zone;

    nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
    page = virt_to_page(pgdat);

    for (i = 0; i < nr_pages; i++, page++)
        get_page_bootmem(node, page, NODE_INFO);

    zone = &pgdat->node_zones[0];
    for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
        if (zone->wait_table) {
            nr_pages = zone->wait_table_hash_nr_entries
                * sizeof(wait_queue_head_t);
            nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
            page = virt_to_page(zone->wait_table);

            for (i = 0; i < nr_pages; i++, page++)
                get_page_bootmem(node, page, NODE_INFO);
        }
    }

    pfn = pgdat->node_start_pfn;
    end_pfn = pfn + pgdat->node_spanned_pages;

    /* register section info */
    for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
        register_page_bootmem_info_section(pfn);

}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

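/*
 * Widen the zone's pfn span to include [start_pfn, end_pfn). The span
 * seqlock keeps readers of zone_start_pfn/spanned_pages consistent while the
 * zone is being resized.
 */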
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
               unsigned long end_pfn)
{
    unsigned long old_zone_end_pfn;

    zone_span_writelock(zone);

    old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
    if (start_pfn < zone->zone_start_pfn)
        zone->zone_start_pfn = start_pfn;

    zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
                zone->zone_start_pfn;

    zone_span_writeunlock(zone);
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                unsigned long end_pfn)
{
    unsigned long old_pgdat_end_pfn =
        pgdat->node_start_pfn + pgdat->node_spanned_pages;

    if (start_pfn < pgdat->node_start_pfn)
        pgdat->node_start_pfn = start_pfn;

    pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
                    pgdat->node_start_pfn;
}

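/*
 * Hook one new section's worth of pages into @zone: initialize the zone if
 * it was empty, widen the zone and node spans under the pgdat resize lock,
 * and initialize the struct pages for the new range via memmap_init_zone().
 */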
static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
    struct pglist_data *pgdat = zone->zone_pgdat;
    int nr_pages = PAGES_PER_SECTION;
    int nid = pgdat->node_id;
    int zone_type;
    unsigned long flags;

    zone_type = zone - pgdat->node_zones;
    if (!zone->wait_table) {
        int ret;

        ret = init_currently_empty_zone(zone, phys_start_pfn,
                        nr_pages, MEMMAP_HOTPLUG);
        if (ret)
            return ret;
    }
    pgdat_resize_lock(zone->zone_pgdat, &flags);
    grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
    grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
            phys_start_pfn + nr_pages);
    pgdat_resize_unlock(zone->zone_pgdat, &flags);
    memmap_init_zone(nr_pages, nid, zone_type,
             phys_start_pfn, MEMMAP_HOTPLUG);
    return 0;
}

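/*
 * Add one memory section worth of pages: allocate its memmap and usemap via
 * sparse_add_one_section(), hook the pages into the zone, and register the
 * corresponding sysfs memory block. Returns -EEXIST if the section already
 * has a valid memmap.
 */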
static int __meminit __add_section(int nid, struct zone *zone,
                    unsigned long phys_start_pfn)
{
    int nr_pages = PAGES_PER_SECTION;
    int ret;

    if (pfn_valid(phys_start_pfn))
        return -EEXIST;

    ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

    if (ret < 0)
        return ret;

    ret = __add_zone(zone, phys_start_pfn);

    if (ret < 0)
        return ret;

    return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
    /*
     * XXX: Freeing memmap with vmemmap is not implemented yet.
     * This should be removed later.
     */
    return -EBUSY;
}
#else
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
    unsigned long flags;
    struct pglist_data *pgdat = zone->zone_pgdat;
    int ret = -EINVAL;

    if (!valid_section(ms))
        return ret;

    ret = unregister_memory_section(ms);
    if (ret)
        return ret;

    pgdat_resize_lock(pgdat, &flags);
    sparse_remove_one_section(zone, ms);
    pgdat_resize_unlock(pgdat, &flags);
    return 0;
}
#endif

/*
 * Reasonably generic function for adding memory. It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
            unsigned long nr_pages)
{
    unsigned long i;
    int err = 0;
    int start_sec, end_sec;
    /* during initialization of mem_map, align the hot-added range to sections */
    start_sec = pfn_to_section_nr(phys_start_pfn);
    end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

    for (i = start_sec; i <= end_sec; i++) {
        err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

        /*
         * -EEXIST is finally dealt with by the ioresource collision
         * check; see add_memory() => register_memory_resource().
         * A warning is printed if there is a collision.
         */
        if (err && (err != -EEXIST))
            break;
        err = 0;
    }

    return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
         unsigned long nr_pages)
{
    unsigned long i, ret = 0;
    int sections_to_remove;

    /*
     * We can only remove entire sections
     */
    BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
    BUG_ON(nr_pages % PAGES_PER_SECTION);

    sections_to_remove = nr_pages / PAGES_PER_SECTION;
    for (i = 0; i < sections_to_remove; i++) {
        unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
        release_mem_region(pfn << PAGE_SHIFT,
                   PAGES_PER_SECTION << PAGE_SHIFT);
        ret = __remove_section(zone, __pfn_to_section(pfn));
        if (ret)
            break;
    }
    return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

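/*
 * Make a single hot-added page usable: update the global page accounting,
 * clear the PG_reserved bit that memmap initialization set, and hand the
 * page over to the buddy allocator.
 */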
void online_page(struct page *page)
{
    unsigned long pfn = page_to_pfn(page);

    totalram_pages++;
    if (pfn >= num_physpages)
        num_physpages = pfn + 1;

#ifdef CONFIG_HIGHMEM
    if (PageHighMem(page))
        totalhigh_pages++;
#endif

#ifdef CONFIG_FLATMEM
    max_mapnr = max(page_to_pfn(page), max_mapnr);
#endif

    ClearPageReserved(page);
    init_page_count(page);
    __free_page(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
            void *arg)
{
    unsigned long i;
    unsigned long onlined_pages = *(unsigned long *)arg;
    struct page *page;
    if (PageReserved(pfn_to_page(start_pfn)))
        for (i = 0; i < nr_pages; i++) {
            page = pfn_to_page(start_pfn + i);
            online_page(page);
            onlined_pages++;
        }
    *(unsigned long *)arg = onlined_pages;
    return 0;
}

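/*
 * Bring a range of hot-added pages online: notify MEM_GOING_ONLINE listeners,
 * hand the pages of every System RAM sub-range to the buddy allocator, update
 * the zone/node present-page counts, rebuild the zonelists if the zone was
 * previously unpopulated, and start kswapd for the node once pages have
 * actually been onlined.
 */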
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
    unsigned long onlined_pages = 0;
    struct zone *zone;
    int need_zonelists_rebuild = 0;
    int nid;
    int ret;
    struct memory_notify arg;

    arg.start_pfn = pfn;
    arg.nr_pages = nr_pages;
    arg.status_change_nid = -1;

    nid = page_to_nid(pfn_to_page(pfn));
    if (node_present_pages(nid) == 0)
        arg.status_change_nid = nid;

    ret = memory_notify(MEM_GOING_ONLINE, &arg);
    ret = notifier_to_errno(ret);
    if (ret) {
        memory_notify(MEM_CANCEL_ONLINE, &arg);
        return ret;
    }
    /*
     * This doesn't need a lock to do pfn_to_page().
     * The section can't be removed here because of the
     * memory_block->state_mutex.
     */
    zone = page_zone(pfn_to_page(pfn));
    /*
     * If this zone is not populated, it is not in the zonelist and the
     * page allocator ignores it, so the zonelists must be rebuilt after
     * onlining.
     */
    mutex_lock(&zonelists_mutex);
    if (!populated_zone(zone))
        need_zonelists_rebuild = 1;

    ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
        online_pages_range);
    if (ret) {
        mutex_unlock(&zonelists_mutex);
        printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
            nr_pages, pfn);
        memory_notify(MEM_CANCEL_ONLINE, &arg);
        return ret;
    }

    zone->present_pages += onlined_pages;
    zone->zone_pgdat->node_present_pages += onlined_pages;
    if (need_zonelists_rebuild)
        build_all_zonelists(zone);
    else
        zone_pcp_update(zone);

    mutex_unlock(&zonelists_mutex);
    setup_per_zone_wmarks();
    calculate_zone_inactive_ratio(zone);
    if (onlined_pages) {
        kswapd_run(zone_to_nid(zone));
        node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
    }

    vm_total_pages = nr_free_pagecache_pages();

    writeback_set_ratelimit();

    if (onlined_pages)
        memory_notify(MEM_ONLINE, &arg);

    return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
    struct pglist_data *pgdat;
    unsigned long zones_size[MAX_NR_ZONES] = {0};
    unsigned long zholes_size[MAX_NR_ZONES] = {0};
    unsigned long start_pfn = start >> PAGE_SHIFT;

    pgdat = arch_alloc_nodedata(nid);
    if (!pgdat)
        return NULL;

    arch_refresh_nodedata(nid, pgdat);

    /* we can use NODE_DATA(nid) from here */

    /* init node's zones as empty zones, we don't have any present pages. */
    free_area_init_node(nid, zones_size, start_pfn, zholes_size);

    return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
    arch_refresh_nodedata(nid, NULL);
    arch_free_nodedata(pgdat);
    return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
    pg_data_t *pgdat;
    int ret;

    lock_system_sleep();
    pgdat = hotadd_new_pgdat(nid, 0);
    if (!pgdat) {
        ret = -ENOMEM;
        goto out;
    }
    node_set_online(nid);
    ret = register_one_node(nid);
    BUG_ON(ret);

out:
    unlock_system_sleep();
    return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
    pg_data_t *pgdat = NULL;
    int new_pgdat = 0;
    struct resource *res;
    int ret;

    lock_system_sleep();

    res = register_memory_resource(start, size);
    ret = -EEXIST;
    if (!res)
        goto out;

    if (!node_online(nid)) {
        pgdat = hotadd_new_pgdat(nid, start);
        ret = -ENOMEM;
        if (!pgdat)
            goto out;
        new_pgdat = 1;
    }

    /* call arch's memory hotadd */
    ret = arch_add_memory(nid, start, size);

    if (ret < 0)
        goto error;

    /* we online the node here; we can't roll back from this point. */
    node_set_online(nid);

    if (new_pgdat) {
        ret = register_one_node(nid);
        /*
         * If the sysfs file for the new node can't be created, CPUs on
         * the node can't be hot-added. There is no way to roll back
         * now, so catch it with BUG_ON(), reluctantly.
         */
        BUG_ON(ret);
    }

    /* create new memmap entry */
    firmware_map_add_hotplug(start, start + size, "System RAM");

    goto out;

error:
    /* rollback pgdat allocation and others */
    if (new_pgdat)
        rollback_node_hotadd(nid, pgdat);
    if (res)
        release_memory_resource(res);

out:
    unlock_system_sleep();
    return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

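/*
 * Illustrative sketch only, not part of this file: a platform driver that
 * has probed a new memory device would typically resolve the node for the
 * physical range and then call add_memory(). The function below and its
 * start/size arguments are hypothetical, and memory_add_physaddr_to_nid()
 * is only available on architectures that provide it.
 *
 *    static int example_probe_memory(u64 start, u64 size)
 *    {
 *        int nid = memory_add_physaddr_to_nid(start);
 *
 *        return add_memory(nid, start, size);
 *    }
 */
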
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
    return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
    /* Ensure the starting page is pageblock-aligned */
    BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

    /* If the entire pageblock is free, move to the end of free page */
    if (pageblock_free(page)) {
        int order;
        /* be careful: we don't hold any locks, so page_order() can change. */
        order = page_order(page);
        if ((order < MAX_ORDER) && (order >= pageblock_order))
            return page + (1 << order);
    }

    return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
    int type;
    struct page *page = pfn_to_page(start_pfn);
    struct page *end_page = page + nr_pages;

    /* Check the starting page of each pageblock within the range */
    for (; page < end_page; page = next_active_pageblock(page)) {
        type = get_pageblock_migratetype(page);

        /*
         * A pageblock containing MOVABLE or free pages is considered
         * removable
         */
        if (type != MIGRATE_MOVABLE && !pageblock_free(page))
            return 0;

        /*
         * A pageblock starting with a PageReserved page is not
         * considered removable.
         */
        if (PageReserved(page))
            return 0;
    }

    /* All pageblocks in the memory block are likely to be hot-removable */
    return 1;
}

/*
 * Confirm that all pages in the range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;
    struct zone *zone = NULL;
    struct page *page;
    int i;
    for (pfn = start_pfn;
         pfn < end_pfn;
         pfn += MAX_ORDER_NR_PAGES) {
        i = 0;
        /* This is just a CONFIG_HOLES_IN_ZONE check.*/
        while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
            i++;
        if (i == MAX_ORDER_NR_PAGES)
            continue;
        page = pfn_to_page(pfn + i);
        if (zone && page_zone(page) != zone)
            return 0;
        zone = page_zone(page);
    }
    return 1;
}

/*
 * Scanning pfns is much easier than scanning the LRU list.
 * Scan pfns from start to end and return the first pfn of an LRU page
 * found, or 0 if there is none.
 */
int scan_lru_pages(unsigned long start, unsigned long end)
{
    unsigned long pfn;
    struct page *page;
    for (pfn = start; pfn < end; pfn++) {
        if (pfn_valid(pfn)) {
            page = pfn_to_page(pfn);
            if (PageLRU(page))
                return pfn;
        }
    }
    return 0;
}

static struct page *
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
{
    /* This should be improooooved!! */
    return alloc_page(GFP_HIGHUSER_MOVABLE);
}

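/*
 * Migrate the in-use LRU pages in [start_pfn, end_pfn) off the range being
 * offlined, at most NR_OFFLINE_AT_ONCE_PAGES per call. Pages that still have
 * users but cannot be isolated from the LRU make the range unmanageable and
 * fail the attempt with -EBUSY.
 */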
#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;
    struct page *page;
    int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
    int not_managed = 0;
    int ret = 0;
    LIST_HEAD(source);

    for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
        if (!pfn_valid(pfn))
            continue;
        page = pfn_to_page(pfn);
        if (!page_count(page))
            continue;
        /*
         * We can skip free pages. We can only deal with pages on
         * the LRU.
         */
        ret = isolate_lru_page(page);
        if (!ret) { /* Success */
            list_add_tail(&page->lru, &source);
            move_pages--;
            inc_zone_page_state(page, NR_ISOLATED_ANON +
                        page_is_file_cache(page));

        } else {
            /* Because we don't hold zone->lock here, we should
               check the page count again. */
            if (page_count(page))
                not_managed++;
#ifdef CONFIG_DEBUG_VM
            printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
                   pfn);
            dump_page(page);
#endif
        }
    }
    ret = -EBUSY;
    if (not_managed) {
        if (!list_empty(&source))
            putback_lru_pages(&source);
        goto out;
    }
    ret = 0;
    if (list_empty(&source))
        goto out;
    /* this function returns # of failed pages */
    ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);

out:
    return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
            void *data)
{
    __offline_isolated_pages(start, start + nr_pages);
    return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
    walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
                offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are
 * isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
            void *data)
{
    int ret;
    long offlined = *(long *)data;
    ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
    offlined = nr_pages;
    if (!ret)
        *(long *)data += offlined;
    return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
    long offlined = 0;
    int ret;

    ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
            check_pages_isolated_cb);
    if (ret < 0)
        offlined = (long)ret;
    return offlined;
}

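/*
 * Offline a pageblock-aligned range: isolate it so the allocator stops
 * handing out its pages, notify MEM_GOING_OFFLINE listeners, then repeatedly
 * drain LRU/pcp pages and migrate whatever is still in use until every page
 * in the range is free, and finally pull the pages out of the free lists and
 * fix up the zone/node accounting. The whole attempt is bounded by the
 * caller-supplied timeout.
 */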
static int offline_pages(unsigned long start_pfn,
          unsigned long end_pfn, unsigned long timeout)
{
    unsigned long pfn, nr_pages, expire;
    long offlined_pages;
    int ret, drain, retry_max, node;
    struct zone *zone;
    struct memory_notify arg;

    BUG_ON(start_pfn >= end_pfn);
    /* at least, alignment against pageblock is necessary */
    if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
        return -EINVAL;
    if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
        return -EINVAL;
    /* This makes hotplug much easier... and more readable.
       We assume this for now. */
    if (!test_pages_in_a_zone(start_pfn, end_pfn))
        return -EINVAL;

    lock_system_sleep();

    zone = page_zone(pfn_to_page(start_pfn));
    node = zone_to_nid(zone);
    nr_pages = end_pfn - start_pfn;

    /* set above range as isolated */
    ret = start_isolate_page_range(start_pfn, end_pfn);
    if (ret)
        goto out;

    arg.start_pfn = start_pfn;
    arg.nr_pages = nr_pages;
    arg.status_change_nid = -1;
    if (nr_pages >= node_present_pages(node))
        arg.status_change_nid = node;

    ret = memory_notify(MEM_GOING_OFFLINE, &arg);
    ret = notifier_to_errno(ret);
    if (ret)
        goto failed_removal;

    pfn = start_pfn;
    expire = jiffies + timeout;
    drain = 0;
    retry_max = 5;
repeat:
    /* start memory hot removal */
    ret = -EAGAIN;
    if (time_after(jiffies, expire))
        goto failed_removal;
    ret = -EINTR;
    if (signal_pending(current))
        goto failed_removal;
    ret = 0;
    if (drain) {
        lru_add_drain_all();
        flush_scheduled_work();
        cond_resched();
        drain_all_pages();
    }

    pfn = scan_lru_pages(start_pfn, end_pfn);
    if (pfn) { /* We have pages on the LRU */
        ret = do_migrate_range(pfn, end_pfn);
        if (!ret) {
            drain = 1;
            goto repeat;
        } else {
            if (ret < 0)
                if (--retry_max == 0)
                    goto failed_removal;
            yield();
            drain = 1;
            goto repeat;
        }
    }
    /* drain all zones' LRU pagevecs; this is asynchronous... */
    lru_add_drain_all();
    flush_scheduled_work();
    yield();
    /* drain pcp pages; this is synchronous. */
    drain_all_pages();
    /* check again */
    offlined_pages = check_pages_isolated(start_pfn, end_pfn);
    if (offlined_pages < 0) {
        ret = -EBUSY;
        goto failed_removal;
    }
    printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
    /* OK, all of our target range is isolated.
       We cannot roll back from this point. */
    offline_isolated_pages(start_pfn, end_pfn);
    /* reset pagetype flags; this makes the migrate type MOVABLE again */
    undo_isolate_page_range(start_pfn, end_pfn);
    /* removal success */
    zone->present_pages -= offlined_pages;
    zone->zone_pgdat->node_present_pages -= offlined_pages;
    totalram_pages -= offlined_pages;

    setup_per_zone_wmarks();
    calculate_zone_inactive_ratio(zone);
    if (!node_present_pages(node)) {
        node_clear_state(node, N_HIGH_MEMORY);
        kswapd_stop(node);
    }

    vm_total_pages = nr_free_pagecache_pages();
    writeback_set_ratelimit();

    memory_notify(MEM_OFFLINE, &arg);
    unlock_system_sleep();
    return 0;

failed_removal:
    printk(KERN_INFO "memory offlining %lx to %lx failed\n",
        start_pfn, end_pfn);
    memory_notify(MEM_CANCEL_OFFLINE, &arg);
    /* pushback to free area */
    undo_isolate_page_range(start_pfn, end_pfn);

out:
    unlock_system_sleep();
    return ret;
}

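/*
 * Note that in this implementation remove_memory() only offlines the pages,
 * giving the range two minutes (120 * HZ) to drain; it does not tear down
 * the section mappings or release the iomem resource (see __remove_pages()
 * for that part).
 */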
int remove_memory(u64 start, u64 size)
{
    unsigned long start_pfn, end_pfn;

    start_pfn = PFN_DOWN(start);
    end_pfn = start_pfn + PFN_DOWN(size);
    return offline_pages(start_pfn, end_pfn, 120 * HZ);
}
#else
int remove_memory(u64 start, u64 size)
{
    return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);
