mm/hugetlb.c

1/*
2 * Generic hugetlb support.
3 * (C) William Irwin, April 2004
4 */
5#include <linux/list.h>
6#include <linux/init.h>
7#include <linux/module.h>
8#include <linux/mm.h>
9#include <linux/seq_file.h>
10#include <linux/sysctl.h>
11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
13#include <linux/nodemask.h>
14#include <linux/pagemap.h>
15#include <linux/mempolicy.h>
16#include <linux/cpuset.h>
17#include <linux/mutex.h>
18#include <linux/bootmem.h>
19#include <linux/sysfs.h>
20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
24
25#include <asm/page.h>
26#include <asm/pgtable.h>
27#include <asm/io.h>
28
29#include <linux/hugetlb.h>
30#include <linux/node.h>
31#include "internal.h"
32
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable;
36
37static int max_hstate;
38unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE];
40
41__initdata LIST_HEAD(huge_boot_pages);
42
43/* for command line parsing */
44static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size;
47
48#define for_each_hstate(h) \
49    for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */
54static DEFINE_SPINLOCK(hugetlb_lock);
55
56/*
57 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping.
59 *
60 * The region data structures are protected by a combination of the mmap_sem
 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex:
64 *
65 * down_write(&mm->mmap_sem);
66 * or
67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */
70struct file_region {
71    struct list_head link;
72    long from;
73    long to;
74};
75
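/*
 * Add the range [f, t) to the reservation map.  Expects that region_chg()
 * has already been called for the same range, so that a region spanning f
 * exists and no allocation is needed here.  Any existing regions that the
 * new range overlaps are merged into one: for example, adding [2, 6) to a
 * map holding [0, 3) and [5, 8) leaves a single region [0, 8).
 */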
76static long region_add(struct list_head *head, long f, long t)
77{
78    struct file_region *rg, *nrg, *trg;
79
80    /* Locate the region we are either in or before. */
81    list_for_each_entry(rg, head, link)
82        if (f <= rg->to)
83            break;
84
85    /* Round our left edge to the current segment if it encloses us. */
86    if (f > rg->from)
87        f = rg->from;
88
89    /* Check for and consume any regions we now overlap with. */
90    nrg = rg;
91    list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
92        if (&rg->link == head)
93            break;
94        if (rg->from > t)
95            break;
96
        /* If this area reaches higher, then extend our area to
         * include it completely. If this is not the first area
         * which we intend to reuse, free it. */
100        if (rg->to > t)
101            t = rg->to;
102        if (rg != nrg) {
103            list_del(&rg->link);
104            kfree(rg);
105        }
106    }
107    nrg->from = f;
108    nrg->to = t;
109    return 0;
110}
111
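/*
 * Return the number of additional huge pages that would need to be
 * reserved to cover the range [f, t), taking any existing regions into
 * account.  The map is not modified beyond possibly inserting a zero-size
 * placeholder region, which guarantees that a subsequent region_add() for
 * the same range cannot fail.  Returns -ENOMEM if the placeholder cannot
 * be allocated.
 */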
112static long region_chg(struct list_head *head, long f, long t)
113{
114    struct file_region *rg, *nrg;
115    long chg = 0;
116
117    /* Locate the region we are before or in. */
118    list_for_each_entry(rg, head, link)
119        if (f <= rg->to)
120            break;
121
    /* If we are below the current region then a new region is required.
     * Subtle: allocate a new region at the position now, but make it zero
     * size so that a later region_add() is guaranteed to be able to record
     * the reservation without having to allocate. */
125    if (&rg->link == head || t < rg->from) {
126        nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
127        if (!nrg)
128            return -ENOMEM;
129        nrg->from = f;
130        nrg->to = f;
131        INIT_LIST_HEAD(&nrg->link);
132        list_add(&nrg->link, rg->link.prev);
133
134        return t - f;
135    }
136
137    /* Round our left edge to the current segment if it encloses us. */
138    if (f > rg->from)
139        f = rg->from;
140    chg = t - f;
141
142    /* Check for and consume any regions we now overlap with. */
143    list_for_each_entry(rg, rg->link.prev, link) {
144        if (&rg->link == head)
145            break;
146        if (rg->from > t)
147            return chg;
148
        /* We overlap with this area; if it extends further than
         * us then we must extend ourselves. Account for its
         * existing reservation. */
152        if (rg->to > t) {
153            chg += rg->to - t;
154            t = rg->to;
155        }
156        chg -= rg->to - rg->from;
157    }
158    return chg;
159}
160
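/*
 * Truncate the reservation map at 'end', trimming any region that
 * straddles the cut-off and dropping every region beyond it.  Returns
 * the number of huge pages removed from the map.
 */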
161static long region_truncate(struct list_head *head, long end)
162{
163    struct file_region *rg, *trg;
164    long chg = 0;
165
166    /* Locate the region we are either in or before. */
167    list_for_each_entry(rg, head, link)
168        if (end <= rg->to)
169            break;
170    if (&rg->link == head)
171        return 0;
172
173    /* If we are in the middle of a region then adjust it. */
174    if (end > rg->from) {
175        chg = rg->to - end;
176        rg->to = end;
177        rg = list_entry(rg->link.next, typeof(*rg), link);
178    }
179
180    /* Drop any remaining regions. */
181    list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
182        if (&rg->link == head)
183            break;
184        chg += rg->to - rg->from;
185        list_del(&rg->link);
186        kfree(rg);
187    }
188    return chg;
189}
190
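/*
 * Count how many pages in the range [f, t) are already covered by
 * regions in the map, i.e. the overlap between [f, t) and the existing
 * reservations.
 */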
191static long region_count(struct list_head *head, long f, long t)
192{
193    struct file_region *rg;
194    long chg = 0;
195
196    /* Locate each segment we overlap with, and count that overlap. */
197    list_for_each_entry(rg, head, link) {
198        int seg_from;
199        int seg_to;
200
201        if (rg->to <= f)
202            continue;
203        if (rg->from >= t)
204            break;
205
206        seg_from = max(rg->from, f);
207        seg_to = min(rg->to, t);
208
209        chg += seg_to - seg_from;
210    }
211
212    return chg;
213}
214
215/*
216 * Convert the address within this vma to the page offset within
217 * the mapping, in pagecache page units; huge pages here.
218 */
219static pgoff_t vma_hugecache_offset(struct hstate *h,
220            struct vm_area_struct *vma, unsigned long address)
221{
222    return ((address - vma->vm_start) >> huge_page_shift(h)) +
223            (vma->vm_pgoff >> huge_page_order(h));
224}
225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227                     unsigned long address)
228{
229    return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as that used by the page table entries.
 */
236unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
237{
238    struct hstate *hstate;
239
240    if (!is_vm_hugetlb_page(vma))
241        return PAGE_SIZE;
242
243    hstate = hstate_vma(vma);
244
245    return 1UL << (hstate->order + PAGE_SHIFT);
246}
247EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
248
249/*
250 * Return the page size being used by the MMU to back a VMA. In the majority
251 * of cases, the page size used by the kernel matches the MMU size. On
252 * architectures where it differs, an architecture-specific version of this
253 * function is required.
254 */
255#ifndef vma_mmu_pagesize
256unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
257{
258    return vma_kernel_pagesize(vma);
259}
260#endif
261
262/*
263 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
264 * bits of the reservation map pointer, which are always clear due to
265 * alignment.
266 */
267#define HPAGE_RESV_OWNER (1UL << 0)
268#define HPAGE_RESV_UNMAPPED (1UL << 1)
269#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
270
/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping. A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and it persists even
 * after the pages are instantiated. A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
290static unsigned long get_vma_private_data(struct vm_area_struct *vma)
291{
292    return (unsigned long)vma->vm_private_data;
293}
294
295static void set_vma_private_data(struct vm_area_struct *vma,
296                            unsigned long value)
297{
298    vma->vm_private_data = (void *)value;
299}
300
301struct resv_map {
302    struct kref refs;
303    struct list_head regions;
304};
305
306static struct resv_map *resv_map_alloc(void)
307{
308    struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
309    if (!resv_map)
310        return NULL;
311
312    kref_init(&resv_map->refs);
313    INIT_LIST_HEAD(&resv_map->regions);
314
315    return resv_map;
316}
317
318static void resv_map_release(struct kref *ref)
319{
320    struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
321
322    /* Clear out any active regions before we release the map. */
323    region_truncate(&resv_map->regions, 0);
324    kfree(resv_map);
325}
326
327static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
328{
329    VM_BUG_ON(!is_vm_hugetlb_page(vma));
330    if (!(vma->vm_flags & VM_MAYSHARE))
331        return (struct resv_map *)(get_vma_private_data(vma) &
332                            ~HPAGE_RESV_MASK);
333    return NULL;
334}
335
336static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
337{
338    VM_BUG_ON(!is_vm_hugetlb_page(vma));
339    VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
340
341    set_vma_private_data(vma, (get_vma_private_data(vma) &
342                HPAGE_RESV_MASK) | (unsigned long)map);
343}
344
345static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
346{
347    VM_BUG_ON(!is_vm_hugetlb_page(vma));
348    VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
349
350    set_vma_private_data(vma, get_vma_private_data(vma) | flags);
351}
352
353static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
354{
355    VM_BUG_ON(!is_vm_hugetlb_page(vma));
356
357    return (get_vma_private_data(vma) & flag) != 0;
358}
359
360/* Decrement the reserved pages in the hugepage pool by one */
361static void decrement_hugepage_resv_vma(struct hstate *h,
362            struct vm_area_struct *vma)
363{
364    if (vma->vm_flags & VM_NORESERVE)
365        return;
366
367    if (vma->vm_flags & VM_MAYSHARE) {
368        /* Shared mappings always use reserves */
369        h->resv_huge_pages--;
370    } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
371        /*
372         * Only the process that called mmap() has reserves for
373         * private mappings.
374         */
375        h->resv_huge_pages--;
376    }
377}
378
379/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
380void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
381{
382    VM_BUG_ON(!is_vm_hugetlb_page(vma));
383    if (!(vma->vm_flags & VM_MAYSHARE))
384        vma->vm_private_data = (void *)0;
385}
386
387/* Returns true if the VMA has associated reserve pages */
388static int vma_has_reserves(struct vm_area_struct *vma)
389{
390    if (vma->vm_flags & VM_MAYSHARE)
391        return 1;
392    if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
393        return 1;
394    return 0;
395}
396
397static void clear_gigantic_page(struct page *page,
398            unsigned long addr, unsigned long sz)
399{
400    int i;
401    struct page *p = page;
402
403    might_sleep();
404    for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405        cond_resched();
406        clear_user_highpage(p, addr + i * PAGE_SIZE);
407    }
408}
409static void clear_huge_page(struct page *page,
410            unsigned long addr, unsigned long sz)
411{
412    int i;
413
414    if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415        clear_gigantic_page(page, addr, sz);
416        return;
417    }
418
419    might_sleep();
420    for (i = 0; i < sz/PAGE_SIZE; i++) {
421        cond_resched();
422        clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423    }
424}
425
426static void copy_gigantic_page(struct page *dst, struct page *src,
427               unsigned long addr, struct vm_area_struct *vma)
428{
429    int i;
430    struct hstate *h = hstate_vma(vma);
431    struct page *dst_base = dst;
432    struct page *src_base = src;
433    might_sleep();
434    for (i = 0; i < pages_per_huge_page(h); ) {
435        cond_resched();
436        copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438        i++;
439        dst = mem_map_next(dst, dst_base, i);
440        src = mem_map_next(src, src_base, i);
441    }
442}
443static void copy_huge_page(struct page *dst, struct page *src,
444               unsigned long addr, struct vm_area_struct *vma)
445{
446    int i;
447    struct hstate *h = hstate_vma(vma);
448
449    if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450        copy_gigantic_page(dst, src, addr, vma);
451        return;
452    }
453
454    might_sleep();
455    for (i = 0; i < pages_per_huge_page(h); i++) {
456        cond_resched();
457        copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
458    }
459}
460
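/*
 * Return a huge page to the free list of its node and update the free
 * page counters.  Called with hugetlb_lock held.
 */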
461static void enqueue_huge_page(struct hstate *h, struct page *page)
462{
463    int nid = page_to_nid(page);
464    list_add(&page->lru, &h->hugepage_freelists[nid]);
465    h->free_huge_pages++;
466    h->free_huge_pages_node[nid]++;
467}
468
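/*
 * Dequeue a free huge page suitable for a fault at 'address' in 'vma',
 * walking the zonelist dictated by the VMA's memory policy and skipping
 * zones that the current cpuset does not allow.  Unless 'avoid_reserve'
 * is set, a reservation is consumed for the page.  Returns NULL if no
 * suitable page is available, e.g. when satisfying the fault would
 * require reserves that the VMA does not own.
 */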
469static struct page *dequeue_huge_page_vma(struct hstate *h,
470                struct vm_area_struct *vma,
471                unsigned long address, int avoid_reserve)
472{
473    int nid;
474    struct page *page = NULL;
475    struct mempolicy *mpol;
476    nodemask_t *nodemask;
477    struct zonelist *zonelist;
478    struct zone *zone;
479    struct zoneref *z;
480
481    get_mems_allowed();
482    zonelist = huge_zonelist(vma, address,
483                    htlb_alloc_mask, &mpol, &nodemask);
    /*
     * A child process with MAP_PRIVATE mappings created by its parent
     * has no page reserves. This check ensures that reservations are
     * not "stolen"; the child may still get SIGKILLed.
     */
489    if (!vma_has_reserves(vma) &&
490            h->free_huge_pages - h->resv_huge_pages == 0)
491        goto err;
492
493    /* If reserves cannot be used, ensure enough pages are in the pool */
494    if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
        goto err;
496
497    for_each_zone_zonelist_nodemask(zone, z, zonelist,
498                        MAX_NR_ZONES - 1, nodemask) {
499        nid = zone_to_nid(zone);
500        if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
501            !list_empty(&h->hugepage_freelists[nid])) {
502            page = list_entry(h->hugepage_freelists[nid].next,
503                      struct page, lru);
504            list_del(&page->lru);
505            h->free_huge_pages--;
506            h->free_huge_pages_node[nid]--;
507
508            if (!avoid_reserve)
509                decrement_hugepage_resv_vma(h, vma);
510
511            break;
512        }
513    }
514err:
515    mpol_cond_put(mpol);
516    put_mems_allowed();
517    return page;
518}
519
520static void update_and_free_page(struct hstate *h, struct page *page)
521{
522    int i;
523
524    VM_BUG_ON(h->order >= MAX_ORDER);
525
526    h->nr_huge_pages--;
527    h->nr_huge_pages_node[page_to_nid(page)]--;
528    for (i = 0; i < pages_per_huge_page(h); i++) {
529        page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
530                1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
                1 << PG_private | 1 << PG_writeback);
532    }
533    set_compound_page_dtor(page, NULL);
534    set_page_refcounted(page);
535    arch_release_hugepage(page);
536    __free_pages(page, huge_page_order(h));
537}
538
539struct hstate *size_to_hstate(unsigned long size)
540{
541    struct hstate *h;
542
543    for_each_hstate(h) {
544        if (huge_page_size(h) == size)
545            return h;
546    }
547    return NULL;
548}
549
550static void free_huge_page(struct page *page)
551{
552    /*
553     * Can't pass hstate in here because it is called from the
554     * compound page destructor.
555     */
556    struct hstate *h = page_hstate(page);
557    int nid = page_to_nid(page);
558    struct address_space *mapping;
559
560    mapping = (struct address_space *) page_private(page);
561    set_page_private(page, 0);
562    page->mapping = NULL;
563    BUG_ON(page_count(page));
564    BUG_ON(page_mapcount(page));
565    INIT_LIST_HEAD(&page->lru);
566
567    spin_lock(&hugetlb_lock);
568    if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
569        update_and_free_page(h, page);
570        h->surplus_huge_pages--;
571        h->surplus_huge_pages_node[nid]--;
572    } else {
573        enqueue_huge_page(h, page);
574    }
575    spin_unlock(&hugetlb_lock);
576    if (mapping)
577        hugetlb_put_quota(mapping, 1);
578}
579
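/*
 * Finish turning a freshly allocated compound page into a huge page of
 * hstate 'h': install the destructor, account it against 'h' and node
 * 'nid', and release it into the huge page allocator via put_page().
 */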
580static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
581{
582    set_compound_page_dtor(page, free_huge_page);
583    spin_lock(&hugetlb_lock);
584    h->nr_huge_pages++;
585    h->nr_huge_pages_node[nid]++;
586    spin_unlock(&hugetlb_lock);
587    put_page(page); /* free it into the hugepage allocator */
588}
589
590static void prep_compound_gigantic_page(struct page *page, unsigned long order)
591{
592    int i;
593    int nr_pages = 1 << order;
594    struct page *p = page + 1;
595
596    /* we rely on prep_new_huge_page to set the destructor */
597    set_compound_order(page, order);
598    __SetPageHead(page);
599    for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
600        __SetPageTail(p);
601        p->first_page = page;
602    }
603}
604
605int PageHuge(struct page *page)
606{
607    compound_page_dtor *dtor;
608
609    if (!PageCompound(page))
610        return 0;
611
612    page = compound_head(page);
613    dtor = get_compound_page_dtor(page);
614
615    return dtor == free_huge_page;
616}
617
618EXPORT_SYMBOL_GPL(PageHuge);
619
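/*
 * Allocate a fresh huge page from the buddy allocator on node 'nid'.
 * Gigantic hstates (order >= MAX_ORDER) cannot be allocated this way
 * and yield NULL, as does any allocation or arch preparation failure.
 */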
620static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
621{
622    struct page *page;
623
624    if (h->order >= MAX_ORDER)
625        return NULL;
626
627    page = alloc_pages_exact_node(nid,
628        htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
629                        __GFP_REPEAT|__GFP_NOWARN,
630        huge_page_order(h));
631    if (page) {
632        if (arch_prepare_hugepage(page)) {
633            __free_pages(page, huge_page_order(h));
634            return NULL;
635        }
636        prep_new_huge_page(h, page, nid);
637    }
638
639    return page;
640}
641
642/*
643 * common helper functions for hstate_next_node_to_{alloc|free}.
644 * We may have allocated or freed a huge page based on a different
645 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
646 * be outside of *nodes_allowed. Ensure that we use an allowed
647 * node for alloc or free.
648 */
649static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
650{
651    nid = next_node(nid, *nodes_allowed);
652    if (nid == MAX_NUMNODES)
653        nid = first_node(*nodes_allowed);
654    VM_BUG_ON(nid >= MAX_NUMNODES);
655
656    return nid;
657}
658
659static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
660{
661    if (!node_isset(nid, *nodes_allowed))
662        nid = next_node_allowed(nid, nodes_allowed);
663    return nid;
664}
665
666/*
667 * returns the previously saved node ["this node"] from which to
668 * allocate a persistent huge page for the pool and advance the
669 * next node from which to allocate, handling wrap at end of node
670 * mask.
671 */
672static int hstate_next_node_to_alloc(struct hstate *h,
673                    nodemask_t *nodes_allowed)
674{
675    int nid;
676
677    VM_BUG_ON(!nodes_allowed);
678
679    nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
680    h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
681
682    return nid;
683}
684
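/*
 * Allocate one fresh huge page, trying the allowed nodes in round-robin
 * order starting at h->next_nid_to_alloc.  Returns 1 on success and 0
 * if every allowed node fails.
 */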
685static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
686{
687    struct page *page;
688    int start_nid;
689    int next_nid;
690    int ret = 0;
691
692    start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
693    next_nid = start_nid;
694
695    do {
696        page = alloc_fresh_huge_page_node(h, next_nid);
697        if (page) {
698            ret = 1;
699            break;
700        }
701        next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
702    } while (next_nid != start_nid);
703
704    if (ret)
705        count_vm_event(HTLB_BUDDY_PGALLOC);
706    else
707        count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
708
709    return ret;
710}
711
712/*
713 * helper for free_pool_huge_page() - return the previously saved
714 * node ["this node"] from which to free a huge page. Advance the
715 * next node id whether or not we find a free huge page to free so
716 * that the next attempt to free addresses the next node.
717 */
718static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
719{
720    int nid;
721
722    VM_BUG_ON(!nodes_allowed);
723
724    nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
725    h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
726
727    return nid;
728}
729
730/*
731 * Free huge page from pool from next node to free.
732 * Attempt to keep persistent huge pages more or less
733 * balanced over allowed nodes.
734 * Called with hugetlb_lock locked.
735 */
736static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
737                             bool acct_surplus)
738{
739    int start_nid;
740    int next_nid;
741    int ret = 0;
742
743    start_nid = hstate_next_node_to_free(h, nodes_allowed);
744    next_nid = start_nid;
745
746    do {
747        /*
748         * If we're returning unused surplus pages, only examine
749         * nodes with surplus pages.
750         */
751        if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
752            !list_empty(&h->hugepage_freelists[next_nid])) {
753            struct page *page =
754                list_entry(h->hugepage_freelists[next_nid].next,
755                      struct page, lru);
756            list_del(&page->lru);
757            h->free_huge_pages--;
758            h->free_huge_pages_node[next_nid]--;
759            if (acct_surplus) {
760                h->surplus_huge_pages--;
761                h->surplus_huge_pages_node[next_nid]--;
762            }
763            update_and_free_page(h, page);
764            ret = 1;
765            break;
766        }
767        next_nid = hstate_next_node_to_free(h, nodes_allowed);
768    } while (next_nid != start_nid);
769
770    return ret;
771}
772
773static struct page *alloc_buddy_huge_page(struct hstate *h,
774            struct vm_area_struct *vma, unsigned long address)
775{
776    struct page *page;
777    unsigned int nid;
778
779    if (h->order >= MAX_ORDER)
780        return NULL;
781
782    /*
783     * Assume we will successfully allocate the surplus page to
784     * prevent racing processes from causing the surplus to exceed
785     * overcommit
786     *
787     * This however introduces a different race, where a process B
788     * tries to grow the static hugepage pool while alloc_pages() is
789     * called by process A. B will only examine the per-node
790     * counters in determining if surplus huge pages can be
791     * converted to normal huge pages in adjust_pool_surplus(). A
792     * won't be able to increment the per-node counter, until the
793     * lock is dropped by B, but B doesn't drop hugetlb_lock until
794     * no more huge pages can be converted from surplus to normal
795     * state (and doesn't try to convert again). Thus, we have a
796     * case where a surplus huge page exists, the pool is grown, and
797     * the surplus huge page still exists after, even though it
798     * should just have been converted to a normal huge page. This
799     * does not leak memory, though, as the hugepage will be freed
800     * once it is out of use. It also does not allow the counters to
801     * go out of whack in adjust_pool_surplus() as we don't modify
802     * the node values until we've gotten the hugepage and only the
803     * per-node value is checked there.
804     */
805    spin_lock(&hugetlb_lock);
806    if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
807        spin_unlock(&hugetlb_lock);
808        return NULL;
809    } else {
810        h->nr_huge_pages++;
811        h->surplus_huge_pages++;
812    }
813    spin_unlock(&hugetlb_lock);
814
815    page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
816                    __GFP_REPEAT|__GFP_NOWARN,
817                    huge_page_order(h));
818
819    if (page && arch_prepare_hugepage(page)) {
820        __free_pages(page, huge_page_order(h));
821        return NULL;
822    }
823
824    spin_lock(&hugetlb_lock);
825    if (page) {
826        /*
827         * This page is now managed by the hugetlb allocator and has
828         * no users -- drop the buddy allocator's reference.
829         */
830        put_page_testzero(page);
831        VM_BUG_ON(page_count(page));
832        nid = page_to_nid(page);
833        set_compound_page_dtor(page, free_huge_page);
834        /*
835         * We incremented the global counters already
836         */
837        h->nr_huge_pages_node[nid]++;
838        h->surplus_huge_pages_node[nid]++;
839        __count_vm_event(HTLB_BUDDY_PGALLOC);
840    } else {
841        h->nr_huge_pages--;
842        h->surplus_huge_pages--;
843        __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
844    }
845    spin_unlock(&hugetlb_lock);
846
847    return page;
848}
849
/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.  Called with hugetlb_lock held; the lock is dropped
 * and reacquired while the new pages are allocated.
 */
854static int gather_surplus_pages(struct hstate *h, int delta)
855{
856    struct list_head surplus_list;
857    struct page *page, *tmp;
858    int ret, i;
859    int needed, allocated;
860
861    needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
862    if (needed <= 0) {
863        h->resv_huge_pages += delta;
864        return 0;
865    }
866
867    allocated = 0;
868    INIT_LIST_HEAD(&surplus_list);
869
870    ret = -ENOMEM;
871retry:
872    spin_unlock(&hugetlb_lock);
873    for (i = 0; i < needed; i++) {
874        page = alloc_buddy_huge_page(h, NULL, 0);
875        if (!page) {
876            /*
877             * We were not able to allocate enough pages to
878             * satisfy the entire reservation so we free what
879             * we've allocated so far.
880             */
881            spin_lock(&hugetlb_lock);
882            needed = 0;
883            goto free;
884        }
885
886        list_add(&page->lru, &surplus_list);
887    }
888    allocated += needed;
889
890    /*
891     * After retaking hugetlb_lock, we need to recalculate 'needed'
892     * because either resv_huge_pages or free_huge_pages may have changed.
893     */
894    spin_lock(&hugetlb_lock);
895    needed = (h->resv_huge_pages + delta) -
896            (h->free_huge_pages + allocated);
897    if (needed > 0)
898        goto retry;
899
900    /*
901     * The surplus_list now contains _at_least_ the number of extra pages
     * needed to accommodate the reservation. Add the appropriate number
903     * of pages to the hugetlb pool and free the extras back to the buddy
904     * allocator. Commit the entire reservation here to prevent another
905     * process from stealing the pages as they are added to the pool but
906     * before they are reserved.
907     */
908    needed += allocated;
909    h->resv_huge_pages += delta;
910    ret = 0;
911free:
912    /* Free the needed pages to the hugetlb pool */
913    list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914        if ((--needed) < 0)
915            break;
916        list_del(&page->lru);
917        enqueue_huge_page(h, page);
918    }
919
920    /* Free unnecessary surplus pages to the buddy allocator */
921    if (!list_empty(&surplus_list)) {
922        spin_unlock(&hugetlb_lock);
923        list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924            list_del(&page->lru);
925            /*
926             * The page has a reference count of zero already, so
927             * call free_huge_page directly instead of using
928             * put_page. This must be done with hugetlb_lock
929             * unlocked which is safe because free_huge_page takes
930             * hugetlb_lock before deciding how to free the page.
931             */
932            free_huge_page(page);
933        }
934        spin_lock(&hugetlb_lock);
935    }
936
937    return ret;
938}
939
940/*
941 * When releasing a hugetlb pool reservation, any surplus pages that were
942 * allocated to satisfy the reservation must be explicitly freed if they were
943 * never used.
944 * Called with hugetlb_lock held.
945 */
946static void return_unused_surplus_pages(struct hstate *h,
947                    unsigned long unused_resv_pages)
948{
949    unsigned long nr_pages;
950
951    /* Uncommit the reservation */
952    h->resv_huge_pages -= unused_resv_pages;
953
954    /* Cannot return gigantic pages currently */
955    if (h->order >= MAX_ORDER)
956        return;
957
958    nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
959
960    /*
961     * We want to release as many surplus pages as possible, spread
962     * evenly across all nodes with memory. Iterate across these nodes
963     * until we can no longer free unreserved surplus pages. This occurs
964     * when the nodes with surplus pages have no free pages.
     * free_pool_huge_page() will balance the freed pages across the
966     * on-line nodes with memory and will handle the hstate accounting.
967     */
968    while (nr_pages--) {
969        if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
970            break;
971    }
972}
973
/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation. Where it does not we will need to logically increase
 * the reservation and actually increase the quota before an allocation
 * can occur. Where any new reservation would be required the reservation
 * change is prepared, but not committed. Once the page has been quota'd,
 * allocated and instantiated, the change should be committed via
 * vma_commit_reservation. No action is required on failure.
 */
983static long vma_needs_reservation(struct hstate *h,
984            struct vm_area_struct *vma, unsigned long addr)
985{
986    struct address_space *mapping = vma->vm_file->f_mapping;
987    struct inode *inode = mapping->host;
988
989    if (vma->vm_flags & VM_MAYSHARE) {
990        pgoff_t idx = vma_hugecache_offset(h, vma, addr);
991        return region_chg(&inode->i_mapping->private_list,
992                            idx, idx + 1);
993
994    } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
995        return 1;
996
997    } else {
998        long err;
999        pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1000        struct resv_map *reservations = vma_resv_map(vma);
1001
1002        err = region_chg(&reservations->regions, idx, idx + 1);
1003        if (err < 0)
1004            return err;
1005        return 0;
1006    }
1007}
1008static void vma_commit_reservation(struct hstate *h,
1009            struct vm_area_struct *vma, unsigned long addr)
1010{
1011    struct address_space *mapping = vma->vm_file->f_mapping;
1012    struct inode *inode = mapping->host;
1013
1014    if (vma->vm_flags & VM_MAYSHARE) {
1015        pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1016        region_add(&inode->i_mapping->private_list, idx, idx + 1);
1017
1018    } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1019        pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1020        struct resv_map *reservations = vma_resv_map(vma);
1021
1022        /* Mark this page used in the map. */
1023        region_add(&reservations->regions, idx, idx + 1);
1024    }
1025}
1026
1027static struct page *alloc_huge_page(struct vm_area_struct *vma,
1028                    unsigned long addr, int avoid_reserve)
1029{
1030    struct hstate *h = hstate_vma(vma);
1031    struct page *page;
1032    struct address_space *mapping = vma->vm_file->f_mapping;
1033    struct inode *inode = mapping->host;
1034    long chg;
1035
    /*
     * Processes that did not create the mapping will have no reserves and
     * will not have accounted against quota. Check that sufficient quota
     * is available before satisfying the allocation.
     * MAP_NORESERVE mappings may also need pages and quota allocated
     * if no reserve mapping overlaps.
     */
1043    chg = vma_needs_reservation(h, vma, addr);
1044    if (chg < 0)
1045        return ERR_PTR(chg);
1046    if (chg)
1047        if (hugetlb_get_quota(inode->i_mapping, chg))
1048            return ERR_PTR(-ENOSPC);
1049
1050    spin_lock(&hugetlb_lock);
1051    page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1052    spin_unlock(&hugetlb_lock);
1053
1054    if (!page) {
1055        page = alloc_buddy_huge_page(h, vma, addr);
1056        if (!page) {
1057            hugetlb_put_quota(inode->i_mapping, chg);
1058            return ERR_PTR(-VM_FAULT_SIGBUS);
1059        }
1060    }
1061
1062    set_page_refcounted(page);
1063    set_page_private(page, (unsigned long) mapping);
1064
1065    vma_commit_reservation(h, vma, addr);
1066
1067    return page;
1068}
1069
1070int __weak alloc_bootmem_huge_page(struct hstate *h)
1071{
1072    struct huge_bootmem_page *m;
1073    int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
1074
1075    while (nr_nodes) {
1076        void *addr;
1077
1078        addr = __alloc_bootmem_node_nopanic(
1079                NODE_DATA(hstate_next_node_to_alloc(h,
1080                        &node_states[N_HIGH_MEMORY])),
1081                huge_page_size(h), huge_page_size(h), 0);
1082
1083        if (addr) {
1084            /*
1085             * Use the beginning of the huge page to store the
1086             * huge_bootmem_page struct (until gather_bootmem
1087             * puts them into the mem_map).
1088             */
1089            m = addr;
1090            goto found;
1091        }
1092        nr_nodes--;
1093    }
1094    return 0;
1095
1096found:
1097    BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
1098    /* Put them into a private list first because mem_map is not up yet */
1099    list_add(&m->list, &huge_boot_pages);
1100    m->hstate = h;
1101    return 1;
1102}
1103
1104static void prep_compound_huge_page(struct page *page, int order)
1105{
1106    if (unlikely(order > (MAX_ORDER - 1)))
1107        prep_compound_gigantic_page(page, order);
1108    else
1109        prep_compound_page(page, order);
1110}
1111
1112/* Put bootmem huge pages into the standard lists after mem_map is up */
1113static void __init gather_bootmem_prealloc(void)
1114{
1115    struct huge_bootmem_page *m;
1116
1117    list_for_each_entry(m, &huge_boot_pages, list) {
1118        struct page *page = virt_to_page(m);
1119        struct hstate *h = m->hstate;
1120        __ClearPageReserved(page);
1121        WARN_ON(page_count(page) != 1);
1122        prep_compound_huge_page(page, h->order);
1123        prep_new_huge_page(h, page, page_to_nid(page));
1124    }
1125}
1126
1127static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
1128{
1129    unsigned long i;
1130
1131    for (i = 0; i < h->max_huge_pages; ++i) {
1132        if (h->order >= MAX_ORDER) {
1133            if (!alloc_bootmem_huge_page(h))
1134                break;
1135        } else if (!alloc_fresh_huge_page(h,
1136                     &node_states[N_HIGH_MEMORY]))
1137            break;
1138    }
1139    h->max_huge_pages = i;
1140}
1141
1142static void __init hugetlb_init_hstates(void)
1143{
1144    struct hstate *h;
1145
1146    for_each_hstate(h) {
1147        /* oversize hugepages were init'ed in early boot */
1148        if (h->order < MAX_ORDER)
1149            hugetlb_hstate_alloc_pages(h);
1150    }
1151}
1152
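/* Format a byte count as a human-readable "<n> GB/MB/KB" string. */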
1153static char * __init memfmt(char *buf, unsigned long n)
1154{
1155    if (n >= (1UL << 30))
1156        sprintf(buf, "%lu GB", n >> 30);
1157    else if (n >= (1UL << 20))
1158        sprintf(buf, "%lu MB", n >> 20);
1159    else
1160        sprintf(buf, "%lu KB", n >> 10);
1161    return buf;
1162}
1163
1164static void __init report_hugepages(void)
1165{
1166    struct hstate *h;
1167
1168    for_each_hstate(h) {
1169        char buf[32];
1170        printk(KERN_INFO "HugeTLB registered %s page size, "
                 "pre-allocated %lu pages\n",
1172            memfmt(buf, huge_page_size(h)),
1173            h->free_huge_pages);
1174    }
1175}
1176
1177#ifdef CONFIG_HIGHMEM
1178static void try_to_free_low(struct hstate *h, unsigned long count,
1179                        nodemask_t *nodes_allowed)
1180{
1181    int i;
1182
1183    if (h->order >= MAX_ORDER)
1184        return;
1185
1186    for_each_node_mask(i, *nodes_allowed) {
1187        struct page *page, *next;
1188        struct list_head *freel = &h->hugepage_freelists[i];
1189        list_for_each_entry_safe(page, next, freel, lru) {
1190            if (count >= h->nr_huge_pages)
1191                return;
1192            if (PageHighMem(page))
1193                continue;
1194            list_del(&page->lru);
1195            update_and_free_page(h, page);
1196            h->free_huge_pages--;
1197            h->free_huge_pages_node[page_to_nid(page)]--;
1198        }
1199    }
1200}
1201#else
1202static inline void try_to_free_low(struct hstate *h, unsigned long count,
1203                        nodemask_t *nodes_allowed)
1204{
1205}
1206#endif
1207
1208/*
1209 * Increment or decrement surplus_huge_pages. Keep node-specific counters
1210 * balanced by operating on them in a round-robin fashion.
1211 * Returns 1 if an adjustment was made.
1212 */
1213static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1214                int delta)
1215{
1216    int start_nid, next_nid;
1217    int ret = 0;
1218
1219    VM_BUG_ON(delta != -1 && delta != 1);
1220
1221    if (delta < 0)
1222        start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
1223    else
1224        start_nid = hstate_next_node_to_free(h, nodes_allowed);
1225    next_nid = start_nid;
1226
1227    do {
1228        int nid = next_nid;
1229        if (delta < 0) {
1230            /*
1231             * To shrink on this node, there must be a surplus page
1232             */
1233            if (!h->surplus_huge_pages_node[nid]) {
1234                next_nid = hstate_next_node_to_alloc(h,
1235                                nodes_allowed);
1236                continue;
1237            }
1238        }
1239        if (delta > 0) {
1240            /*
1241             * Surplus cannot exceed the total number of pages
1242             */
1243            if (h->surplus_huge_pages_node[nid] >=
1244                        h->nr_huge_pages_node[nid]) {
1245                next_nid = hstate_next_node_to_free(h,
1246                                nodes_allowed);
1247                continue;
1248            }
1249        }
1250
1251        h->surplus_huge_pages += delta;
1252        h->surplus_huge_pages_node[nid] += delta;
1253        ret = 1;
1254        break;
1255    } while (next_nid != start_nid);
1256
1257    return ret;
1258}
1259
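/*
 * Huge pages that are persistent, i.e. part of the static pool rather
 * than temporary surplus pages allocated to back an overcommit.
 */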
1260#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
1261static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
1262                        nodemask_t *nodes_allowed)
1263{
1264    unsigned long min_count, ret;
1265
1266    if (h->order >= MAX_ORDER)
1267        return h->max_huge_pages;
1268
1269    /*
1270     * Increase the pool size
1271     * First take pages out of surplus state. Then make up the
1272     * remaining difference by allocating fresh huge pages.
1273     *
1274     * We might race with alloc_buddy_huge_page() here and be unable
1275     * to convert a surplus huge page to a normal huge page. That is
1276     * not critical, though, it just means the overall size of the
1277     * pool might be one hugepage larger than it needs to be, but
1278     * within all the constraints specified by the sysctls.
1279     */
1280    spin_lock(&hugetlb_lock);
1281    while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
1282        if (!adjust_pool_surplus(h, nodes_allowed, -1))
1283            break;
1284    }
1285
1286    while (count > persistent_huge_pages(h)) {
1287        /*
1288         * If this allocation races such that we no longer need the
1289         * page, free_huge_page will handle it by freeing the page
1290         * and reducing the surplus.
1291         */
1292        spin_unlock(&hugetlb_lock);
1293        ret = alloc_fresh_huge_page(h, nodes_allowed);
1294        spin_lock(&hugetlb_lock);
1295        if (!ret)
1296            goto out;
1297
1298        /* Bail for signals. Probably ctrl-c from user */
1299        if (signal_pending(current))
1300            goto out;
1301    }
1302
1303    /*
1304     * Decrease the pool size
1305     * First return free pages to the buddy allocator (being careful
1306     * to keep enough around to satisfy reservations). Then place
1307     * pages into surplus state as needed so the pool will shrink
1308     * to the desired size as pages become free.
1309     *
1310     * By placing pages into the surplus state independent of the
1311     * overcommit value, we are allowing the surplus pool size to
1312     * exceed overcommit. There are few sane options here. Since
1313     * alloc_buddy_huge_page() is checking the global counter,
1314     * though, we'll note that we're not allowed to exceed surplus
1315     * and won't grow the pool anywhere else. Not until one of the
1316     * sysctls are changed, or the surplus pages go out of use.
1317     */
1318    min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1319    min_count = max(count, min_count);
1320    try_to_free_low(h, min_count, nodes_allowed);
1321    while (min_count < persistent_huge_pages(h)) {
1322        if (!free_pool_huge_page(h, nodes_allowed, 0))
1323            break;
1324    }
1325    while (count < persistent_huge_pages(h)) {
1326        if (!adjust_pool_surplus(h, nodes_allowed, 1))
1327            break;
1328    }
1329out:
1330    ret = persistent_huge_pages(h);
1331    spin_unlock(&hugetlb_lock);
1332    return ret;
1333}
1334
1335#define HSTATE_ATTR_RO(_name) \
1336    static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1337
1338#define HSTATE_ATTR(_name) \
1339    static struct kobj_attribute _name##_attr = \
1340        __ATTR(_name, 0644, _name##_show, _name##_store)
1341
1342static struct kobject *hugepages_kobj;
1343static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1344
1345static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
1346
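/*
 * Map a sysfs kobject back to its hstate.  *nidp is set to NUMA_NO_NODE
 * for the global hstate directories; otherwise the lookup falls through
 * to the per-node variant, which reports the node id.
 */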
1347static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
1348{
1349    int i;
1350
1351    for (i = 0; i < HUGE_MAX_HSTATE; i++)
1352        if (hstate_kobjs[i] == kobj) {
1353            if (nidp)
1354                *nidp = NUMA_NO_NODE;
1355            return &hstates[i];
1356        }
1357
1358    return kobj_to_node_hstate(kobj, nidp);
1359}
1360
1361static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1362                    struct kobj_attribute *attr, char *buf)
1363{
1364    struct hstate *h;
1365    unsigned long nr_huge_pages;
1366    int nid;
1367
1368    h = kobj_to_hstate(kobj, &nid);
1369    if (nid == NUMA_NO_NODE)
1370        nr_huge_pages = h->nr_huge_pages;
1371    else
1372        nr_huge_pages = h->nr_huge_pages_node[nid];
1373
1374    return sprintf(buf, "%lu\n", nr_huge_pages);
1375}
1376static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1377            struct kobject *kobj, struct kobj_attribute *attr,
1378            const char *buf, size_t len)
1379{
1380    int err;
1381    int nid;
1382    unsigned long count;
1383    struct hstate *h;
1384    NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1385
1386    err = strict_strtoul(buf, 10, &count);
1387    if (err)
1388        return 0;
1389
1390    h = kobj_to_hstate(kobj, &nid);
1391    if (nid == NUMA_NO_NODE) {
1392        /*
1393         * global hstate attribute
1394         */
1395        if (!(obey_mempolicy &&
1396                init_nodemask_of_mempolicy(nodes_allowed))) {
1397            NODEMASK_FREE(nodes_allowed);
1398            nodes_allowed = &node_states[N_HIGH_MEMORY];
1399        }
1400    } else if (nodes_allowed) {
1401        /*
1402         * per node hstate attribute: adjust count to global,
1403         * but restrict alloc/free to the specified node.
1404         */
1405        count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
1406        init_nodemask_of_node(nodes_allowed, nid);
1407    } else
1408        nodes_allowed = &node_states[N_HIGH_MEMORY];
1409
1410    h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
1411
1412    if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1413        NODEMASK_FREE(nodes_allowed);
1414
1415    return len;
1416}
1417
1418static ssize_t nr_hugepages_show(struct kobject *kobj,
1419                       struct kobj_attribute *attr, char *buf)
1420{
1421    return nr_hugepages_show_common(kobj, attr, buf);
1422}
1423
1424static ssize_t nr_hugepages_store(struct kobject *kobj,
1425           struct kobj_attribute *attr, const char *buf, size_t len)
1426{
1427    return nr_hugepages_store_common(false, kobj, attr, buf, len);
1428}
1429HSTATE_ATTR(nr_hugepages);
1430
1431#ifdef CONFIG_NUMA
1432
1433/*
1434 * hstate attribute for optionally mempolicy-based constraint on persistent
1435 * huge page alloc/free.
1436 */
1437static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
1438                       struct kobj_attribute *attr, char *buf)
1439{
1440    return nr_hugepages_show_common(kobj, attr, buf);
1441}
1442
1443static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
1444           struct kobj_attribute *attr, const char *buf, size_t len)
1445{
1446    return nr_hugepages_store_common(true, kobj, attr, buf, len);
1447}
1448HSTATE_ATTR(nr_hugepages_mempolicy);
1449#endif
1450
1451
1452static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1453                    struct kobj_attribute *attr, char *buf)
1454{
1455    struct hstate *h = kobj_to_hstate(kobj, NULL);
1456    return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1457}
1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1459        struct kobj_attribute *attr, const char *buf, size_t count)
1460{
1461    int err;
1462    unsigned long input;
1463    struct hstate *h = kobj_to_hstate(kobj, NULL);
1464
1465    err = strict_strtoul(buf, 10, &input);
1466    if (err)
1467        return 0;
1468
1469    spin_lock(&hugetlb_lock);
1470    h->nr_overcommit_huge_pages = input;
1471    spin_unlock(&hugetlb_lock);
1472
1473    return count;
1474}
1475HSTATE_ATTR(nr_overcommit_hugepages);
1476
1477static ssize_t free_hugepages_show(struct kobject *kobj,
1478                    struct kobj_attribute *attr, char *buf)
1479{
1480    struct hstate *h;
1481    unsigned long free_huge_pages;
1482    int nid;
1483
1484    h = kobj_to_hstate(kobj, &nid);
1485    if (nid == NUMA_NO_NODE)
1486        free_huge_pages = h->free_huge_pages;
1487    else
1488        free_huge_pages = h->free_huge_pages_node[nid];
1489
1490    return sprintf(buf, "%lu\n", free_huge_pages);
1491}
1492HSTATE_ATTR_RO(free_hugepages);
1493
1494static ssize_t resv_hugepages_show(struct kobject *kobj,
1495                    struct kobj_attribute *attr, char *buf)
1496{
1497    struct hstate *h = kobj_to_hstate(kobj, NULL);
1498    return sprintf(buf, "%lu\n", h->resv_huge_pages);
1499}
1500HSTATE_ATTR_RO(resv_hugepages);
1501
1502static ssize_t surplus_hugepages_show(struct kobject *kobj,
1503                    struct kobj_attribute *attr, char *buf)
1504{
1505    struct hstate *h;
1506    unsigned long surplus_huge_pages;
1507    int nid;
1508
1509    h = kobj_to_hstate(kobj, &nid);
1510    if (nid == NUMA_NO_NODE)
1511        surplus_huge_pages = h->surplus_huge_pages;
1512    else
1513        surplus_huge_pages = h->surplus_huge_pages_node[nid];
1514
1515    return sprintf(buf, "%lu\n", surplus_huge_pages);
1516}
1517HSTATE_ATTR_RO(surplus_hugepages);
1518
1519static struct attribute *hstate_attrs[] = {
1520    &nr_hugepages_attr.attr,
1521    &nr_overcommit_hugepages_attr.attr,
1522    &free_hugepages_attr.attr,
1523    &resv_hugepages_attr.attr,
1524    &surplus_hugepages_attr.attr,
1525#ifdef CONFIG_NUMA
1526    &nr_hugepages_mempolicy_attr.attr,
1527#endif
1528    NULL,
1529};
1530
1531static struct attribute_group hstate_attr_group = {
1532    .attrs = hstate_attrs,
1533};
1534
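/*
 * Create a kobject named after the hstate (e.g. "hugepages-2048kB")
 * beneath 'parent', record it in hstate_kobjs[] and populate it with
 * the given attribute group.
 */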
1535static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1536                    struct kobject **hstate_kobjs,
1537                    struct attribute_group *hstate_attr_group)
1538{
1539    int retval;
1540    int hi = h - hstates;
1541
1542    hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1543    if (!hstate_kobjs[hi])
1544        return -ENOMEM;
1545
1546    retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
1547    if (retval)
1548        kobject_put(hstate_kobjs[hi]);
1549
1550    return retval;
1551}
1552
1553static void __init hugetlb_sysfs_init(void)
1554{
1555    struct hstate *h;
1556    int err;
1557
1558    hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1559    if (!hugepages_kobj)
1560        return;
1561
1562    for_each_hstate(h) {
1563        err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1564                     hstate_kobjs, &hstate_attr_group);
1565        if (err)
1566            printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1567                                h->name);
1568    }
1569}
1570
1571#ifdef CONFIG_NUMA
1572
1573/*
1574 * node_hstate/s - associate per node hstate attributes, via their kobjects,
1575 * with node sysdevs in node_devices[] using a parallel array. The array
1576 * index of a node sysdev or _hstate == node id.
1577 * This is here to avoid any static dependency of the node sysdev driver, in
1578 * the base kernel, on the hugetlb module.
1579 */
1580struct node_hstate {
1581    struct kobject *hugepages_kobj;
1582    struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1583};
1584struct node_hstate node_hstates[MAX_NUMNODES];
1585
1586/*
1587 * A subset of global hstate attributes for node sysdevs
1588 */
1589static struct attribute *per_node_hstate_attrs[] = {
1590    &nr_hugepages_attr.attr,
1591    &free_hugepages_attr.attr,
1592    &surplus_hugepages_attr.attr,
1593    NULL,
1594};
1595
1596static struct attribute_group per_node_hstate_attr_group = {
1597    .attrs = per_node_hstate_attrs,
1598};
1599
1600/*
1601 * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
1602 * Returns node id via non-NULL nidp.
1603 */
1604static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1605{
1606    int nid;
1607
1608    for (nid = 0; nid < nr_node_ids; nid++) {
1609        struct node_hstate *nhs = &node_hstates[nid];
1610        int i;
1611        for (i = 0; i < HUGE_MAX_HSTATE; i++)
1612            if (nhs->hstate_kobjs[i] == kobj) {
1613                if (nidp)
1614                    *nidp = nid;
1615                return &hstates[i];
1616            }
1617    }
1618
1619    BUG();
1620    return NULL;
1621}
1622
1623/*
1624 * Unregister hstate attributes from a single node sysdev.
1625 * No-op if no hstate attributes attached.
1626 */
1627void hugetlb_unregister_node(struct node *node)
1628{
1629    struct hstate *h;
1630    struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1631
1632    if (!nhs->hugepages_kobj)
1633        return; /* no hstate attributes */
1634
1635    for_each_hstate(h)
1636        if (nhs->hstate_kobjs[h - hstates]) {
1637            kobject_put(nhs->hstate_kobjs[h - hstates]);
1638            nhs->hstate_kobjs[h - hstates] = NULL;
1639        }
1640
1641    kobject_put(nhs->hugepages_kobj);
1642    nhs->hugepages_kobj = NULL;
1643}
1644
1645/*
1646 * hugetlb module exit: unregister hstate attributes from node sysdevs
1647 * that have them.
1648 */
1649static void hugetlb_unregister_all_nodes(void)
1650{
1651    int nid;
1652
1653    /*
1654     * disable node sysdev registrations.
1655     */
1656    register_hugetlbfs_with_node(NULL, NULL);
1657
1658    /*
1659     * remove hstate attributes from any nodes that have them.
1660     */
1661    for (nid = 0; nid < nr_node_ids; nid++)
1662        hugetlb_unregister_node(&node_devices[nid]);
1663}
1664
1665/*
1666 * Register hstate attributes for a single node sysdev.
1667 * No-op if attributes already registered.
1668 */
1669void hugetlb_register_node(struct node *node)
1670{
1671    struct hstate *h;
1672    struct node_hstate *nhs = &node_hstates[node->sysdev.id];
1673    int err;
1674
1675    if (nhs->hugepages_kobj)
1676        return; /* already allocated */
1677
1678    nhs->hugepages_kobj = kobject_create_and_add("hugepages",
1679                            &node->sysdev.kobj);
1680    if (!nhs->hugepages_kobj)
1681        return;
1682
1683    for_each_hstate(h) {
1684        err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
1685                        nhs->hstate_kobjs,
1686                        &per_node_hstate_attr_group);
1687        if (err) {
1688            printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
1689                    " for node %d\n",
1690                        h->name, node->sysdev.id);
1691            hugetlb_unregister_node(node);
1692            break;
1693        }
1694    }
1695}
1696
1697/*
1698 * hugetlb init time: register hstate attributes for all registered node
1699 * sysdevs of nodes that have memory. All on-line nodes should have
1700 * registered their associated sysdev by this time.
1701 */
1702static void hugetlb_register_all_nodes(void)
1703{
1704    int nid;
1705
1706    for_each_node_state(nid, N_HIGH_MEMORY) {
1707        struct node *node = &node_devices[nid];
1708        if (node->sysdev.id == nid)
1709            hugetlb_register_node(node);
1710    }
1711
1712    /*
1713     * Let the node sysdev driver know we're here so it can
1714     * [un]register hstate attributes on node hotplug.
1715     */
1716    register_hugetlbfs_with_node(hugetlb_register_node,
1717                     hugetlb_unregister_node);
1718}
1719#else /* !CONFIG_NUMA */
1720
1721static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
1722{
1723    BUG();
1724    if (nidp)
1725        *nidp = -1;
1726    return NULL;
1727}
1728
1729static void hugetlb_unregister_all_nodes(void) { }
1730
1731static void hugetlb_register_all_nodes(void) { }
1732
1733#endif
1734
1735static void __exit hugetlb_exit(void)
1736{
1737    struct hstate *h;
1738
1739    hugetlb_unregister_all_nodes();
1740
1741    for_each_hstate(h) {
1742        kobject_put(hstate_kobjs[h - hstates]);
1743    }
1744
1745    kobject_put(hugepages_kobj);
1746}
1747module_exit(hugetlb_exit);
1748
1749static int __init hugetlb_init(void)
1750{
    /* Some platforms decide whether they support huge pages at boot
     * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
     * there is no such support.
     */
1755    if (HPAGE_SHIFT == 0)
1756        return 0;
1757
1758    if (!size_to_hstate(default_hstate_size)) {
1759        default_hstate_size = HPAGE_SIZE;
1760        if (!size_to_hstate(default_hstate_size))
1761            hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1762    }
1763    default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1764    if (default_hstate_max_huge_pages)
1765        default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1766
1767    hugetlb_init_hstates();
1768
1769    gather_bootmem_prealloc();
1770
1771    report_hugepages();
1772
1773    hugetlb_sysfs_init();
1774
1775    hugetlb_register_all_nodes();
1776
1777    return 0;
1778}
1779module_init(hugetlb_init);
1780
1781/* Should be called on processing a hugepagesz=... option */
1782void __init hugetlb_add_hstate(unsigned order)
1783{
1784    struct hstate *h;
1785    unsigned long i;
1786
1787    if (size_to_hstate(PAGE_SIZE << order)) {
1788        printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1789        return;
1790    }
1791    BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1792    BUG_ON(order == 0);
1793    h = &hstates[max_hstate++];
1794    h->order = order;
1795    h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1796    h->nr_huge_pages = 0;
1797    h->free_huge_pages = 0;
1798    for (i = 0; i < MAX_NUMNODES; ++i)
1799        INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1800    h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1801    h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1802    snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1803                    huge_page_size(h)/1024);
1804
1805    parsed_hstate = h;
1806}
1807
1808static int __init hugetlb_nrpages_setup(char *s)
1809{
1810    unsigned long *mhp;
1811    static unsigned long *last_mhp;
1812
1813    /*
1814     * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1815     * so this hugepages= parameter goes to the "default hstate".
1816     */
1817    if (!max_hstate)
1818        mhp = &default_hstate_max_huge_pages;
1819    else
1820        mhp = &parsed_hstate->max_huge_pages;
1821
1822    if (mhp == last_mhp) {
1823        printk(KERN_WARNING "hugepages= specified twice without "
1824            "interleaving hugepagesz=, ignoring\n");
1825        return 1;
1826    }
1827
1828    if (sscanf(s, "%lu", mhp) <= 0)
1829        *mhp = 0;
1830
1831    /*
1832     * Global state is always initialized later in hugetlb_init.
1833     * But we need to allocate >= MAX_ORDER hstates here early to still
1834     * use the bootmem allocator.
1835     */
1836    if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1837        hugetlb_hstate_alloc_pages(parsed_hstate);
1838
1839    last_mhp = mhp;
1840
1841    return 1;
1842}
1843__setup("hugepages=", hugetlb_nrpages_setup);
1844
1845static int __init hugetlb_default_setup(char *s)
1846{
1847    default_hstate_size = memparse(s, &s);
1848    return 1;
1849}
1850__setup("default_hugepagesz=", hugetlb_default_setup);
1851
1852static unsigned int cpuset_mems_nr(unsigned int *array)
1853{
1854    int node;
1855    unsigned int nr = 0;
1856
1857    for_each_node_mask(node, cpuset_current_mems_allowed)
1858        nr += array[node];
1859
1860    return nr;
1861}
1862
1863#ifdef CONFIG_SYSCTL
1864static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1865             struct ctl_table *table, int write,
1866             void __user *buffer, size_t *length, loff_t *ppos)
1867{
1868    struct hstate *h = &default_hstate;
1869    unsigned long tmp;
1870
1871    if (!write)
1872        tmp = h->max_huge_pages;
1873
1874    table->data = &tmp;
1875    table->maxlen = sizeof(unsigned long);
1876    proc_doulongvec_minmax(table, write, buffer, length, ppos);
1877
1878    if (write) {
1879        NODEMASK_ALLOC(nodemask_t, nodes_allowed,
1880                        GFP_KERNEL | __GFP_NORETRY);
1881        if (!(obey_mempolicy &&
1882                   init_nodemask_of_mempolicy(nodes_allowed))) {
1883            NODEMASK_FREE(nodes_allowed);
1884            nodes_allowed = &node_states[N_HIGH_MEMORY];
1885        }
1886        h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
1887
1888        if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1889            NODEMASK_FREE(nodes_allowed);
1890    }
1891
1892    return 0;
1893}
1894
1895int hugetlb_sysctl_handler(struct ctl_table *table, int write,
1896              void __user *buffer, size_t *length, loff_t *ppos)
1897{
1898
1899    return hugetlb_sysctl_handler_common(false, table, write,
1900                            buffer, length, ppos);
1901}
1902
1903#ifdef CONFIG_NUMA
1904int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
1905              void __user *buffer, size_t *length, loff_t *ppos)
1906{
1907    return hugetlb_sysctl_handler_common(true, table, write,
1908                            buffer, length, ppos);
1909}
1910#endif /* CONFIG_NUMA */
1911
1912int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
1913            void __user *buffer,
1914            size_t *length, loff_t *ppos)
1915{
1916    proc_dointvec(table, write, buffer, length, ppos);
1917    if (hugepages_treat_as_movable)
1918        htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
1919    else
1920        htlb_alloc_mask = GFP_HIGHUSER;
1921    return 0;
1922}
1923
1924int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1925            void __user *buffer,
1926            size_t *length, loff_t *ppos)
1927{
1928    struct hstate *h = &default_hstate;
1929    unsigned long tmp;
1930
1931    if (!write)
1932        tmp = h->nr_overcommit_huge_pages;
1933
1934    table->data = &tmp;
1935    table->maxlen = sizeof(unsigned long);
1936    proc_doulongvec_minmax(table, write, buffer, length, ppos);
1937
1938    if (write) {
1939        spin_lock(&hugetlb_lock);
1940        h->nr_overcommit_huge_pages = tmp;
1941        spin_unlock(&hugetlb_lock);
1942    }
1943
1944    return 0;
1945}
1946
1947#endif /* CONFIG_SYSCTL */
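/*
 * Editor's note (illustrative, not part of the original source): the
 * handlers above are hooked up to ctl_table entries elsewhere (in
 * kernel/sysctl.c in this era), so the pool is normally tuned through
 * procfs at run time, for example:
 *
 *   echo 128 > /proc/sys/vm/nr_hugepages             (hugetlb_sysctl_handler)
 *   echo 64  > /proc/sys/vm/nr_overcommit_hugepages  (hugetlb_overcommit_handler)
 *   echo 1   > /proc/sys/vm/hugepages_treat_as_movable
 *
 * On NUMA kernels, /proc/sys/vm/nr_hugepages_mempolicy resizes the pool
 * only on the nodes allowed by the writer's memory policy
 * (hugetlb_mempolicy_sysctl_handler).
 */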
1948
1949void hugetlb_report_meminfo(struct seq_file *m)
1950{
1951    struct hstate *h = &default_hstate;
1952    seq_printf(m,
1953            "HugePages_Total: %5lu\n"
1954            "HugePages_Free: %5lu\n"
1955            "HugePages_Rsvd: %5lu\n"
1956            "HugePages_Surp: %5lu\n"
1957            "Hugepagesize: %8lu kB\n",
1958            h->nr_huge_pages,
1959            h->free_huge_pages,
1960            h->resv_huge_pages,
1961            h->surplus_huge_pages,
1962            1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
1963}
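/*
 * Editor's note (illustrative): the seq_printf() above produces the
 * HugePages_* block of /proc/meminfo, e.g. with 128 free 2 MB pages:
 *
 *   HugePages_Total:     128
 *   HugePages_Free:      128
 *   HugePages_Rsvd:        0
 *   HugePages_Surp:        0
 *   Hugepagesize:       2048 kB
 *
 * Only the default hstate is reported here; additional huge page sizes are
 * visible through the sysfs hstate directories instead.
 */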
1964
1965int hugetlb_report_node_meminfo(int nid, char *buf)
1966{
1967    struct hstate *h = &default_hstate;
1968    return sprintf(buf,
1969        "Node %d HugePages_Total: %5u\n"
1970        "Node %d HugePages_Free: %5u\n"
1971        "Node %d HugePages_Surp: %5u\n",
1972        nid, h->nr_huge_pages_node[nid],
1973        nid, h->free_huge_pages_node[nid],
1974        nid, h->surplus_huge_pages_node[nid]);
1975}
1976
1977/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
1978unsigned long hugetlb_total_pages(void)
1979{
1980    struct hstate *h = &default_hstate;
1981    return h->nr_huge_pages * pages_per_huge_page(h);
1982}
1983
1984static int hugetlb_acct_memory(struct hstate *h, long delta)
1985{
1986    int ret = -ENOMEM;
1987
1988    spin_lock(&hugetlb_lock);
1989    /*
1990     * When cpuset is configured, it breaks the strict hugetlb page
1991     * reservation as the accounting is done on a global variable. Such
1992     * reservation is completely rubbish in the presence of cpuset because
1993     * the reservation is not checked against page availability for the
1994     * current cpuset. An application can still be OOM-killed by the
1995     * kernel when the cpuset it runs in lacks free hugetlb pages.
1996     * Attempting to enforce strict accounting with cpusets is almost
1997     * impossible (or too ugly) because cpusets are too fluid: tasks
1998     * and memory nodes can be moved between cpusets at any time.
1999     *
2000     * Changing the semantics of shared hugetlb mappings under cpusets is
2001     * undesirable. However, in order to preserve some of the semantics,
2002     * we fall back to checking the current free page availability as a
2003     * best effort, hopefully minimizing the impact of the semantic
2004     * change that cpusets introduce.
2005     */
2006    if (delta > 0) {
2007        if (gather_surplus_pages(h, delta) < 0)
2008            goto out;
2009
2010        if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
2011            return_unused_surplus_pages(h, delta);
2012            goto out;
2013        }
2014    }
2015
2016    ret = 0;
2017    if (delta < 0)
2018        return_unused_surplus_pages(h, (unsigned long) -delta);
2019
2020out:
2021    spin_unlock(&hugetlb_lock);
2022    return ret;
2023}
2024
2025static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2026{
2027    struct resv_map *reservations = vma_resv_map(vma);
2028
2029    /*
2030     * This new VMA should share its sibling's reservation map if present.
2031     * The VMA will only ever have a valid reservation map pointer when
2032     * it is being copied from another still-existing VMA. As that VMA
2033     * has a reference to the reservation map it cannot disappear until
2034     * after this open call completes. It is therefore safe to take a
2035     * new reference here without additional locking.
2036     */
2037    if (reservations)
2038        kref_get(&reservations->refs);
2039}
2040
2041static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2042{
2043    struct hstate *h = hstate_vma(vma);
2044    struct resv_map *reservations = vma_resv_map(vma);
2045    unsigned long reserve;
2046    unsigned long start;
2047    unsigned long end;
2048
2049    if (reservations) {
2050        start = vma_hugecache_offset(h, vma, vma->vm_start);
2051        end = vma_hugecache_offset(h, vma, vma->vm_end);
2052
2053        reserve = (end - start) -
2054            region_count(&reservations->regions, start, end);
2055
2056        kref_put(&reservations->refs, resv_map_release);
2057
2058        if (reserve) {
2059            hugetlb_acct_memory(h, -reserve);
2060            hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
2061        }
2062    }
2063}
2064
2065/*
2066 * We cannot handle pagefaults against hugetlb pages at all. They cause
2067 * handle_mm_fault() to try to instantiate regular-sized pages in the
2068 * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
2069 * this far.
2070 */
2071static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2072{
2073    BUG();
2074    return 0;
2075}
2076
2077const struct vm_operations_struct hugetlb_vm_ops = {
2078    .fault = hugetlb_vm_op_fault,
2079    .open = hugetlb_vm_op_open,
2080    .close = hugetlb_vm_op_close,
2081};
2082
2083static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2084                int writable)
2085{
2086    pte_t entry;
2087
2088    if (writable) {
2089        entry =
2090            pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
2091    } else {
2092        entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
2093    }
2094    entry = pte_mkyoung(entry);
2095    entry = pte_mkhuge(entry);
2096
2097    return entry;
2098}
2099
2100static void set_huge_ptep_writable(struct vm_area_struct *vma,
2101                   unsigned long address, pte_t *ptep)
2102{
2103    pte_t entry;
2104
2105    entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2106    if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
2107        update_mmu_cache(vma, address, ptep);
2108    }
2109}
2110
2111
2112int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2113                struct vm_area_struct *vma)
2114{
2115    pte_t *src_pte, *dst_pte, entry;
2116    struct page *ptepage;
2117    unsigned long addr;
2118    int cow;
2119    struct hstate *h = hstate_vma(vma);
2120    unsigned long sz = huge_page_size(h);
2121
2122    cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2123
2124    for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2125        src_pte = huge_pte_offset(src, addr);
2126        if (!src_pte)
2127            continue;
2128        dst_pte = huge_pte_alloc(dst, addr, sz);
2129        if (!dst_pte)
2130            goto nomem;
2131
2132        /* If the pagetables are shared don't copy or take references */
2133        if (dst_pte == src_pte)
2134            continue;
2135
2136        spin_lock(&dst->page_table_lock);
2137        spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
2138        if (!huge_pte_none(huge_ptep_get(src_pte))) {
2139            if (cow)
2140                huge_ptep_set_wrprotect(src, addr, src_pte);
2141            entry = huge_ptep_get(src_pte);
2142            ptepage = pte_page(entry);
2143            get_page(ptepage);
2144            page_dup_rmap(ptepage);
2145            set_huge_pte_at(dst, addr, dst_pte, entry);
2146        }
2147        spin_unlock(&src->page_table_lock);
2148        spin_unlock(&dst->page_table_lock);
2149    }
2150    return 0;
2151
2152nomem:
2153    return -ENOMEM;
2154}
2155
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{
2158    swp_entry_t swp;
2159
2160    if (huge_pte_none(pte) || pte_present(pte))
2161        return 0;
2162    swp = pte_to_swp_entry(pte);
2163    if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2164        return 1;
2165    } else
2166        return 0;
2167}
2168
2169void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2170                unsigned long end, struct page *ref_page)
2171{
2172    struct mm_struct *mm = vma->vm_mm;
2173    unsigned long address;
2174    pte_t *ptep;
2175    pte_t pte;
2176    struct page *page;
2177    struct page *tmp;
2178    struct hstate *h = hstate_vma(vma);
2179    unsigned long sz = huge_page_size(h);
2180
2181    /*
2182     * A page gathering list, protected by per file i_mmap_lock. The
2183     * lock is used to avoid list corruption from multiple unmapping
2184     * of the same page since we are using page->lru.
2185     */
2186    LIST_HEAD(page_list);
2187
2188    WARN_ON(!is_vm_hugetlb_page(vma));
2189    BUG_ON(start & ~huge_page_mask(h));
2190    BUG_ON(end & ~huge_page_mask(h));
2191
2192    mmu_notifier_invalidate_range_start(mm, start, end);
2193    spin_lock(&mm->page_table_lock);
2194    for (address = start; address < end; address += sz) {
2195        ptep = huge_pte_offset(mm, address);
2196        if (!ptep)
2197            continue;
2198
2199        if (huge_pmd_unshare(mm, &address, ptep))
2200            continue;
2201
2202        /*
2203         * If a reference page is supplied, it is because a specific
2204         * page is being unmapped, not a range. Ensure the page we
2205         * are about to unmap is the actual page of interest.
2206         */
2207        if (ref_page) {
2208            pte = huge_ptep_get(ptep);
2209            if (huge_pte_none(pte))
2210                continue;
2211            page = pte_page(pte);
2212            if (page != ref_page)
2213                continue;
2214
2215            /*
2216             * Mark the VMA as having unmapped its page so that
2217             * future faults in this VMA will fail rather than
2218             * looking like data was lost
2219             */
2220            set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
2221        }
2222
2223        pte = huge_ptep_get_and_clear(mm, address, ptep);
2224        if (huge_pte_none(pte))
2225            continue;
2226
2227        /*
2228         * HWPoisoned hugepage is already unmapped and dropped reference
2229         */
2230        if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2231            continue;
2232
2233        page = pte_page(pte);
2234        if (pte_dirty(pte))
2235            set_page_dirty(page);
2236        list_add(&page->lru, &page_list);
2237    }
2238    spin_unlock(&mm->page_table_lock);
2239    flush_tlb_range(vma, start, end);
2240    mmu_notifier_invalidate_range_end(mm, start, end);
2241    list_for_each_entry_safe(page, tmp, &page_list, lru) {
2242        page_remove_rmap(page);
2243        list_del(&page->lru);
2244        put_page(page);
2245    }
2246}
2247
2248void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2249              unsigned long end, struct page *ref_page)
2250{
2251    spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
2252    __unmap_hugepage_range(vma, start, end, ref_page);
2253    spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
2254}
2255
2256/*
2257 * This is called when the original mapper is failing to COW a MAP_PRIVATE
2258 * mappping it owns the reserve page for. The intention is to unmap the page
2259 * from other VMAs and let the children be SIGKILLed if they are faulting the
2260 * same region.
2261 */
2262static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2263                struct page *page, unsigned long address)
2264{
2265    struct hstate *h = hstate_vma(vma);
2266    struct vm_area_struct *iter_vma;
2267    struct address_space *mapping;
2268    struct prio_tree_iter iter;
2269    pgoff_t pgoff;
2270
2271    /*
2272     * vm_pgoff is in PAGE_SIZE units, hence the different calculation
2273     * from page cache lookup which is in HPAGE_SIZE units.
2274     */
2275    address = address & huge_page_mask(h);
2276    pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
2277        + (vma->vm_pgoff >> PAGE_SHIFT);
2278    mapping = (struct address_space *)page_private(page);
2279
2280    /*
2281     * Take the mapping lock for the duration of the table walk. As
2282     * this mapping should be shared between all the VMAs, call
2283     * __unmap_hugepage_range() directly, as the lock is already held.
2284     */
2285    spin_lock(&mapping->i_mmap_lock);
2286    vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2287        /* Do not unmap the current VMA */
2288        if (iter_vma == vma)
2289            continue;
2290
2291        /*
2292         * Unmap the page from other VMAs without their own reserves.
2293         * They get marked to be SIGKILLed if they fault in these
2294         * areas. This is because a future no-page fault on this VMA
2295         * could insert a zeroed page instead of the data existing
2296         * from the time of fork. This would look like data corruption
2297         */
2298        if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2299            __unmap_hugepage_range(iter_vma,
2300                address, address + huge_page_size(h),
2301                page);
2302    }
2303    spin_unlock(&mapping->i_mmap_lock);
2304
2305    return 1;
2306}
2307
2308/*
2309 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2310 */
2311static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2312            unsigned long address, pte_t *ptep, pte_t pte,
2313            struct page *pagecache_page)
2314{
2315    struct hstate *h = hstate_vma(vma);
2316    struct page *old_page, *new_page;
2317    int avoidcopy;
2318    int outside_reserve = 0;
2319
2320    old_page = pte_page(pte);
2321
2322retry_avoidcopy:
2323    /* If no-one else is actually using this page, avoid the copy
2324     * and just make the page writable */
2325    avoidcopy = (page_mapcount(old_page) == 1);
2326    if (avoidcopy) {
2327        if (PageAnon(old_page))
2328            page_move_anon_rmap(old_page, vma, address);
2329        set_huge_ptep_writable(vma, address, ptep);
2330        return 0;
2331    }
2332
2333    /*
2334     * If the process that created a MAP_PRIVATE mapping is about to
2335     * perform a COW due to a shared page count, attempt to satisfy
2336     * the allocation without using the existing reserves. The pagecache
2337     * page is used to determine if the reserve at this address was
2338     * consumed or not. If reserves were used, a partial faulted mapping
2339     * at the time of fork() could consume its reserves on COW instead
2340     * of the full address range.
2341     */
2342    if (!(vma->vm_flags & VM_MAYSHARE) &&
2343            is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2344            old_page != pagecache_page)
2345        outside_reserve = 1;
2346
2347    page_cache_get(old_page);
2348
2349    /* Drop page_table_lock as buddy allocator may be called */
2350    spin_unlock(&mm->page_table_lock);
2351    new_page = alloc_huge_page(vma, address, outside_reserve);
2352
2353    if (IS_ERR(new_page)) {
2354        page_cache_release(old_page);
2355
2356        /*
2357         * If a process owning a MAP_PRIVATE mapping fails to COW,
2358         * it is due to references held by a child and an insufficient
2359         * huge page pool. To guarantee the original mapper's
2360         * reliability, unmap the page from child processes. The child
2361         * may get SIGKILLed if it later faults.
2362         */
2363        if (outside_reserve) {
2364            BUG_ON(huge_pte_none(pte));
2365            if (unmap_ref_private(mm, vma, old_page, address)) {
2366                BUG_ON(page_count(old_page) != 1);
2367                BUG_ON(huge_pte_none(pte));
2368                spin_lock(&mm->page_table_lock);
2369                goto retry_avoidcopy;
2370            }
2371            WARN_ON_ONCE(1);
2372        }
2373
2374        /* Caller expects lock to be held */
2375        spin_lock(&mm->page_table_lock);
2376        return -PTR_ERR(new_page);
2377    }
2378
2379    /*
2380     * When the original hugepage is a shared one, it does not have
2381     * anon_vma prepared.
2382     */
2383    if (unlikely(anon_vma_prepare(vma)))
2384        return VM_FAULT_OOM;
2385
2386    copy_huge_page(new_page, old_page, address, vma);
2387    __SetPageUptodate(new_page);
2388
2389    /*
2390     * Retake the page_table_lock to check for racing updates
2391     * before the page tables are altered
2392     */
2393    spin_lock(&mm->page_table_lock);
2394    ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2395    if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2396        /* Break COW */
2397        mmu_notifier_invalidate_range_start(mm,
2398            address & huge_page_mask(h),
2399            (address & huge_page_mask(h)) + huge_page_size(h));
2400        huge_ptep_clear_flush(vma, address, ptep);
2401        set_huge_pte_at(mm, address, ptep,
2402                make_huge_pte(vma, new_page, 1));
2403        page_remove_rmap(old_page);
2404        hugepage_add_new_anon_rmap(new_page, vma, address);
2405        /* Make the old page be freed below */
2406        new_page = old_page;
2407        mmu_notifier_invalidate_range_end(mm,
2408            address & huge_page_mask(h),
2409            (address & huge_page_mask(h)) + huge_page_size(h));
2410    }
2411    page_cache_release(new_page);
2412    page_cache_release(old_page);
2413    return 0;
2414}
2415
2416/* Return the pagecache page at a given address within a VMA */
2417static struct page *hugetlbfs_pagecache_page(struct hstate *h,
2418            struct vm_area_struct *vma, unsigned long address)
2419{
2420    struct address_space *mapping;
2421    pgoff_t idx;
2422
2423    mapping = vma->vm_file->f_mapping;
2424    idx = vma_hugecache_offset(h, vma, address);
2425
2426    return find_lock_page(mapping, idx);
2427}
2428
2429/*
2430 * Return whether there is a pagecache page backing the given address in the VMA.
2431 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
2432 */
2433static bool hugetlbfs_pagecache_present(struct hstate *h,
2434            struct vm_area_struct *vma, unsigned long address)
2435{
2436    struct address_space *mapping;
2437    pgoff_t idx;
2438    struct page *page;
2439
2440    mapping = vma->vm_file->f_mapping;
2441    idx = vma_hugecache_offset(h, vma, address);
2442
2443    page = find_get_page(mapping, idx);
2444    if (page)
2445        put_page(page);
2446    return page != NULL;
2447}
2448
2449static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2450            unsigned long address, pte_t *ptep, unsigned int flags)
2451{
2452    struct hstate *h = hstate_vma(vma);
2453    int ret = VM_FAULT_SIGBUS;
2454    pgoff_t idx;
2455    unsigned long size;
2456    struct page *page;
2457    struct address_space *mapping;
2458    pte_t new_pte;
2459
2460    /*
2461     * Currently, we are forced to kill the process in the event the
2462     * original mapper has unmapped pages from the child due to a failed
2463     * COW. Warn that such a situation has occurred as it may not be obvious.
2464     */
2465    if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2466        printk(KERN_WARNING
2467            "PID %d killed due to inadequate hugepage pool\n",
2468            current->pid);
2469        return ret;
2470    }
2471
2472    mapping = vma->vm_file->f_mapping;
2473    idx = vma_hugecache_offset(h, vma, address);
2474
2475    /*
2476     * Use page lock to guard against racing truncation
2477     * before we get page_table_lock.
2478     */
2479retry:
2480    page = find_lock_page(mapping, idx);
2481    if (!page) {
2482        size = i_size_read(mapping->host) >> huge_page_shift(h);
2483        if (idx >= size)
2484            goto out;
2485        page = alloc_huge_page(vma, address, 0);
2486        if (IS_ERR(page)) {
2487            ret = -PTR_ERR(page);
2488            goto out;
2489        }
2490        clear_huge_page(page, address, huge_page_size(h));
2491        __SetPageUptodate(page);
2492
2493        if (vma->vm_flags & VM_MAYSHARE) {
2494            int err;
2495            struct inode *inode = mapping->host;
2496
2497            err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
2498            if (err) {
2499                put_page(page);
2500                if (err == -EEXIST)
2501                    goto retry;
2502                goto out;
2503            }
2504
2505            spin_lock(&inode->i_lock);
2506            inode->i_blocks += blocks_per_huge_page(h);
2507            spin_unlock(&inode->i_lock);
2508            page_dup_rmap(page);
2509        } else {
2510            lock_page(page);
2511            if (unlikely(anon_vma_prepare(vma))) {
2512                ret = VM_FAULT_OOM;
2513                goto backout_unlocked;
2514            }
2515            hugepage_add_new_anon_rmap(page, vma, address);
2516        }
2517    } else {
2518        page_dup_rmap(page);
2519    }
2520
2521    /*
2522     * Since the memory error handler replaces the pte with a hwpoison swap
2523     * entry at the time of error handling, a process which has reserved
2524     * but not yet mapped the error hugepage has no hwpoison swap entry.
2525     * So we need to block accesses from such a process by checking
2526     * the PG_hwpoison bit here.
2527     */
2528    if (unlikely(PageHWPoison(page))) {
2529        ret = VM_FAULT_HWPOISON;
2530        goto backout_unlocked;
2531    }
2532
2533    /*
2534     * If we are going to COW a private mapping later, we examine the
2535     * pending reservations for this page now. This will ensure that
2536     * any allocations necessary to record that reservation occur outside
2537     * the spinlock.
2538     */
2539    if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
2540        if (vma_needs_reservation(h, vma, address) < 0) {
2541            ret = VM_FAULT_OOM;
2542            goto backout_unlocked;
2543        }
2544
2545    spin_lock(&mm->page_table_lock);
2546    size = i_size_read(mapping->host) >> huge_page_shift(h);
2547    if (idx >= size)
2548        goto backout;
2549
2550    ret = 0;
2551    if (!huge_pte_none(huge_ptep_get(ptep)))
2552        goto backout;
2553
2554    new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
2555                && (vma->vm_flags & VM_SHARED)));
2556    set_huge_pte_at(mm, address, ptep, new_pte);
2557
2558    if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2559        /* Optimization, do the COW without a second fault */
2560        ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
2561    }
2562
2563    spin_unlock(&mm->page_table_lock);
2564    unlock_page(page);
2565out:
2566    return ret;
2567
2568backout:
2569    spin_unlock(&mm->page_table_lock);
2570backout_unlocked:
2571    unlock_page(page);
2572    put_page(page);
2573    goto out;
2574}
2575
2576int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2577            unsigned long address, unsigned int flags)
2578{
2579    pte_t *ptep;
2580    pte_t entry;
2581    int ret;
2582    struct page *page = NULL;
2583    struct page *pagecache_page = NULL;
2584    static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2585    struct hstate *h = hstate_vma(vma);
2586
2587    ptep = huge_pte_offset(mm, address);
2588    if (ptep) {
2589        entry = huge_ptep_get(ptep);
2590        if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2591            return VM_FAULT_HWPOISON;
2592    }
2593
2594    ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2595    if (!ptep)
2596        return VM_FAULT_OOM;
2597
2598    /*
2599     * Serialize hugepage allocation and instantiation, so that we don't
2600     * get spurious allocation failures if two CPUs race to instantiate
2601     * the same page in the page cache.
2602     */
2603    mutex_lock(&hugetlb_instantiation_mutex);
2604    entry = huge_ptep_get(ptep);
2605    if (huge_pte_none(entry)) {
2606        ret = hugetlb_no_page(mm, vma, address, ptep, flags);
2607        goto out_mutex;
2608    }
2609
2610    ret = 0;
2611
2612    /*
2613     * If we are going to COW the mapping later, we examine the pending
2614     * reservations for this page now. This will ensure that any
2615     * allocations necessary to record that reservation occur outside the
2616     * spinlock. For private mappings, we also lookup the pagecache
2617     * page now as it is used to determine if a reservation has been
2618     * consumed.
2619     */
2620    if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
2621        if (vma_needs_reservation(h, vma, address) < 0) {
2622            ret = VM_FAULT_OOM;
2623            goto out_mutex;
2624        }
2625
2626        if (!(vma->vm_flags & VM_MAYSHARE))
2627            pagecache_page = hugetlbfs_pagecache_page(h,
2628                                vma, address);
2629    }
2630
2631    /*
2632     * hugetlb_cow() requires page locks of pte_page(entry) and
2633     * pagecache_page, so here we need to take the former one
2634     * when page != pagecache_page or !pagecache_page.
2635     * Note that locking order is always pagecache_page -> page,
2636     * so no worry about deadlock.
2637     */
2638    page = pte_page(entry);
2639    if (page != pagecache_page)
2640        lock_page(page);
2641
2642    spin_lock(&mm->page_table_lock);
2643    /* Check for a racing update before calling hugetlb_cow */
2644    if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
2645        goto out_page_table_lock;
2646
2647
2648    if (flags & FAULT_FLAG_WRITE) {
2649        if (!pte_write(entry)) {
2650            ret = hugetlb_cow(mm, vma, address, ptep, entry,
2651                            pagecache_page);
2652            goto out_page_table_lock;
2653        }
2654        entry = pte_mkdirty(entry);
2655    }
2656    entry = pte_mkyoung(entry);
2657    if (huge_ptep_set_access_flags(vma, address, ptep, entry,
2658                        flags & FAULT_FLAG_WRITE))
2659        update_mmu_cache(vma, address, ptep);
2660
2661out_page_table_lock:
2662    spin_unlock(&mm->page_table_lock);
2663
2664    if (pagecache_page) {
2665        unlock_page(pagecache_page);
2666        put_page(pagecache_page);
2667    }
2668    unlock_page(page);
2669
2670out_mutex:
2671    mutex_unlock(&hugetlb_instantiation_mutex);
2672
2673    return ret;
2674}
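/*
 * Editor's note (illustrative, not part of the original source): a minimal
 * user-space sketch of what exercises the fault path above. Touching an
 * anonymous MAP_HUGETLB mapping (or a mapping of a hugetlbfs file) faults
 * through hugetlb_fault() and, on the first touch, hugetlb_no_page():
 *
 *   #define _GNU_SOURCE
 *   #include <sys/mman.h>
 *   #include <string.h>
 *
 *   int main(void)
 *   {
 *       size_t len = 2UL << 20;              // assumes a 2 MB default hstate
 *       void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *       if (p == MAP_FAILED)
 *           return 1;                        // pool empty or too small
 *       memset(p, 0, len);                   // first touch: hugetlb_no_page()
 *       return 0;
 *   }
 */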
2675
2676/* Can be overridden by architectures */
2677__attribute__((weak)) struct page *
2678follow_huge_pud(struct mm_struct *mm, unsigned long address,
2679           pud_t *pud, int write)
2680{
2681    BUG();
2682    return NULL;
2683}
2684
2685int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2686            struct page **pages, struct vm_area_struct **vmas,
2687            unsigned long *position, int *length, int i,
2688            unsigned int flags)
2689{
2690    unsigned long pfn_offset;
2691    unsigned long vaddr = *position;
2692    int remainder = *length;
2693    struct hstate *h = hstate_vma(vma);
2694
2695    spin_lock(&mm->page_table_lock);
2696    while (vaddr < vma->vm_end && remainder) {
2697        pte_t *pte;
2698        int absent;
2699        struct page *page;
2700
2701        /*
2702         * Some archs (sparc64, sh*) have multiple pte_ts for
2703         * each hugepage. We have to make sure we get the
2704         * first, for the page indexing below to work.
2705         */
2706        pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2707        absent = !pte || huge_pte_none(huge_ptep_get(pte));
2708
2709        /*
2710         * When coredumping, it suits get_dump_page if we just return
2711         * an error where there's an empty slot with no huge pagecache
2712         * to back it. This way, we avoid allocating a hugepage, and
2713         * the sparse dumpfile avoids allocating disk blocks, but its
2714         * huge holes still show up with zeroes where they need to be.
2715         */
2716        if (absent && (flags & FOLL_DUMP) &&
2717            !hugetlbfs_pagecache_present(h, vma, vaddr)) {
2718            remainder = 0;
2719            break;
2720        }
2721
2722        if (absent ||
2723            ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
2724            int ret;
2725
2726            spin_unlock(&mm->page_table_lock);
2727            ret = hugetlb_fault(mm, vma, vaddr,
2728                (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
2729            spin_lock(&mm->page_table_lock);
2730            if (!(ret & VM_FAULT_ERROR))
2731                continue;
2732
2733            remainder = 0;
2734            break;
2735        }
2736
2737        pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
2738        page = pte_page(huge_ptep_get(pte));
2739same_page:
2740        if (pages) {
2741            pages[i] = mem_map_offset(page, pfn_offset);
2742            get_page(pages[i]);
2743        }
2744
2745        if (vmas)
2746            vmas[i] = vma;
2747
2748        vaddr += PAGE_SIZE;
2749        ++pfn_offset;
2750        --remainder;
2751        ++i;
2752        if (vaddr < vma->vm_end && remainder &&
2753                pfn_offset < pages_per_huge_page(h)) {
2754            /*
2755             * We use pfn_offset to avoid touching the pageframes
2756             * of this compound page.
2757             */
2758            goto same_page;
2759        }
2760    }
2761    spin_unlock(&mm->page_table_lock);
2762    *length = remainder;
2763    *position = vaddr;
2764
2765    return i ? i : -EFAULT;
2766}
2767
2768void hugetlb_change_protection(struct vm_area_struct *vma,
2769        unsigned long address, unsigned long end, pgprot_t newprot)
2770{
2771    struct mm_struct *mm = vma->vm_mm;
2772    unsigned long start = address;
2773    pte_t *ptep;
2774    pte_t pte;
2775    struct hstate *h = hstate_vma(vma);
2776
2777    BUG_ON(address >= end);
2778    flush_cache_range(vma, address, end);
2779
2780    spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
2781    spin_lock(&mm->page_table_lock);
2782    for (; address < end; address += huge_page_size(h)) {
2783        ptep = huge_pte_offset(mm, address);
2784        if (!ptep)
2785            continue;
2786        if (huge_pmd_unshare(mm, &address, ptep))
2787            continue;
2788        if (!huge_pte_none(huge_ptep_get(ptep))) {
2789            pte = huge_ptep_get_and_clear(mm, address, ptep);
2790            pte = pte_mkhuge(pte_modify(pte, newprot));
2791            set_huge_pte_at(mm, address, ptep, pte);
2792        }
2793    }
2794    spin_unlock(&mm->page_table_lock);
2795    spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
2796
2797    flush_tlb_range(vma, start, end);
2798}
2799
2800int hugetlb_reserve_pages(struct inode *inode,
2801                    long from, long to,
2802                    struct vm_area_struct *vma,
2803                    int acctflag)
2804{
2805    long ret, chg;
2806    struct hstate *h = hstate_inode(inode);
2807
2808    /*
2809     * Only apply hugepage reservation if asked. At fault time, a
2810     * VM_NORESERVE mapping will attempt to allocate a page and
2811     * filesystem quota without using the reserves.
2812     */
2813    if (acctflag & VM_NORESERVE)
2814        return 0;
2815
2816    /*
2817     * Shared mappings base their reservation on the number of pages that
2818     * are already allocated on behalf of the file. Private mappings need
2819     * to reserve the full area even if read-only as mprotect() may be
2820     * called to make the mapping read-write. Assume !vma is a shm mapping
2821     */
2822    if (!vma || vma->vm_flags & VM_MAYSHARE)
2823        chg = region_chg(&inode->i_mapping->private_list, from, to);
2824    else {
2825        struct resv_map *resv_map = resv_map_alloc();
2826        if (!resv_map)
2827            return -ENOMEM;
2828
2829        chg = to - from;
2830
2831        set_vma_resv_map(vma, resv_map);
2832        set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2833    }
2834
2835    if (chg < 0)
2836        return chg;
2837
2838    /* There must be enough filesystem quota for the mapping */
2839    if (hugetlb_get_quota(inode->i_mapping, chg))
2840        return -ENOSPC;
2841
2842    /*
2843     * Check enough hugepages are available for the reservation.
2844     * Hand back the quota if there are not
2845     */
2846    ret = hugetlb_acct_memory(h, chg);
2847    if (ret < 0) {
2848        hugetlb_put_quota(inode->i_mapping, chg);
2849        return ret;
2850    }
2851
2852    /*
2853     * Account for the reservations made. Shared mappings record regions
2854     * that have reservations as they are shared by multiple VMAs.
2855     * When the last VMA disappears, the region map says how much
2856     * the reservation was and the page cache tells how much of
2857     * the reservation was consumed. Private mappings are per-VMA and
2858     * only the consumed reservations are tracked. When the VMA
2859     * disappears, the original reservation is the VMA size and the
2860     * consumed reservations are stored in the map. Hence, nothing
2861     * else has to be done for private mappings here
2862     */
2863    if (!vma || vma->vm_flags & VM_MAYSHARE)
2864        region_add(&inode->i_mapping->private_list, from, to);
2865    return 0;
2866}
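/*
 * Editor's note (illustrative, not part of the original source): this is
 * the mmap-time half of the reservation scheme; in this era it is reached
 * from hugetlbfs_file_mmap() (fs/hugetlbfs/inode.c) and from the SysV
 * SHM_HUGETLB setup path. A successful mmap() of a hugetlbfs file therefore
 * already guarantees the pages, roughly:
 *
 *   size_t sz = 8 * (2UL << 20);            // 8 huge pages, assuming 2 MB
 *   int fd = open("/mnt/huge/data", O_CREAT | O_RDWR, 0600);  // assumed hugetlbfs mount
 *   ftruncate(fd, sz);
 *   void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *   // mmap() fails with ENOMEM here if the reservation cannot be made.
 *
 * Passing MAP_NORESERVE instead defers the check to fault time, which is
 * what the VM_NORESERVE test at the top of this function implements.
 */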
2867
2868void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2869{
2870    struct hstate *h = hstate_inode(inode);
2871    long chg = region_truncate(&inode->i_mapping->private_list, offset);
2872
2873    spin_lock(&inode->i_lock);
2874    inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2875    spin_unlock(&inode->i_lock);
2876
2877    hugetlb_put_quota(inode->i_mapping, (chg - freed));
2878    hugetlb_acct_memory(h, -(chg - freed));
2879}
2880
2881/*
2882 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page.
2884 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage)
2886{
2887    struct hstate *h = page_hstate(hpage);
2888    int nid = page_to_nid(hpage);
2889
2890    spin_lock(&hugetlb_lock);
2891    list_del(&hpage->lru);
2892    h->free_huge_pages--;
2893    h->free_huge_pages_node[nid]--;
2894    spin_unlock(&hugetlb_lock);
2895}
2896
