mm/mempolicy.c

1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy an process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
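/*
 * For illustration, this is roughly how the policies above are selected
 * from user space (a sketch using the set_mempolicy()/mbind() syscall
 * wrappers from libnuma's <numaif.h>; buf/len and error handling are
 * placeholders):
 *
 *	unsigned long nodes01 = (1UL << 0) | (1UL << 1);
 *	unsigned long node0 = 1UL << 0;
 *
 *	// process policy: interleave new allocations over nodes 0 and 1
 *	set_mempolicy(MPOL_INTERLEAVE, &nodes01, 8 * sizeof(nodes01) + 1);
 *
 *	// VMA policy: bind an existing mapping to node 0 only
 *	mbind(buf, len, MPOL_BIND, &node0, 8 * sizeof(node0) + 1, MPOL_MF_MOVE);
 *
 *	// back to the default local policy
 *	set_mempolicy(MPOL_DEFAULT, NULL, 0);
 */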
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always grateful with that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/slab.h>
77#include <linux/string.h>
78#include <linux/module.h>
79#include <linux/nsproxy.h>
80#include <linux/interrupt.h>
81#include <linux/init.h>
82#include <linux/compat.h>
83#include <linux/swap.h>
84#include <linux/seq_file.h>
85#include <linux/proc_fs.h>
86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
93
94#include <asm/tlbflush.h>
95#include <asm/uaccess.h>
96
97#include "internal.h"
98
99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
102
103static struct kmem_cache *policy_cache;
104static struct kmem_cache *sn_cache;
105
106/* Highest zone. A specific allocation for a zone below that is not
107   policied. */
108enum zone_type policy_zone = 0;
109
110/*
111 * run-time system-wide default policy => local allocation
112 */
113struct mempolicy default_policy = {
114    .refcnt = ATOMIC_INIT(1), /* never free it */
115    .mode = MPOL_PREFERRED,
116    .flags = MPOL_F_LOCAL,
117};
118
119static const struct mempolicy_operations {
120    int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
121    /*
122     * If a read-side task has no lock to protect task->mempolicy, the
123     * write-side task will rebind task->mempolicy in two steps. The first
124     * step sets all the newly allowed nodes, and the second step clears
125     * all the disallowed nodes. This way we avoid leaving a window in
126     * which no node is allowed for allocation.
127     * If we have a lock to protect task->mempolicy on the read side, we
128     * rebind directly.
129     *
130     * step:
131     * MPOL_REBIND_ONCE - do the rebind work in one pass
132     * MPOL_REBIND_STEP1 - set all the newly allowed nodes
133     * MPOL_REBIND_STEP2 - clear all the disallowed nodes
134     */
135    void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
136            enum mpol_rebind_step step);
137} mpol_ops[MPOL_MAX];
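/*
 * For example (a sketch of the two-step rebind described above): an
 * MPOL_INTERLEAVE policy over nodes {0,1} being rebound to a new cpuset
 * of {2,3} without a read-side lock goes through
 *	MPOL_REBIND_STEP1: v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2: v.nodes = {2,3}
 * so a concurrent reader always sees at least one allowed node.
 */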
138
139/* Check that the nodemask contains at least one populated zone */
140static int is_valid_nodemask(const nodemask_t *nodemask)
141{
142    int nd, k;
143
144    for_each_node_mask(nd, *nodemask) {
145        struct zone *z;
146
147        for (k = 0; k <= policy_zone; k++) {
148            z = &NODE_DATA(nd)->node_zones[k];
149            if (z->present_pages > 0)
150                return 1;
151        }
152    }
153
154    return 0;
155}
156
157static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
158{
159    return pol->flags & MPOL_MODE_FLAGS;
160}
161
162static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
163                   const nodemask_t *rel)
164{
165    nodemask_t tmp;
166    nodes_fold(tmp, *orig, nodes_weight(*rel));
167    nodes_onto(*ret, tmp, *rel);
168}
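/*
 * For example (assuming the usual nodes_fold()/nodes_onto() semantics):
 * with cpuset mems_allowed = {4,6,8} and a user-supplied relative mask
 * of {0,2}, the relative bits are folded modulo the three allowed nodes
 * and then mapped onto them in order, yielding an effective mask of
 * {4,8} (the 1st and 3rd allowed nodes).
 */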
169
170static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
171{
172    if (nodes_empty(*nodes))
173        return -EINVAL;
174    pol->v.nodes = *nodes;
175    return 0;
176}
177
178static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
179{
180    if (!nodes)
181        pol->flags |= MPOL_F_LOCAL; /* local allocation */
182    else if (nodes_empty(*nodes))
183        return -EINVAL; /* no allowed nodes */
184    else
185        pol->v.preferred_node = first_node(*nodes);
186    return 0;
187}
188
189static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
190{
191    if (!is_valid_nodemask(nodes))
192        return -EINVAL;
193    pol->v.nodes = *nodes;
194    return 0;
195}
196
197/*
198 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
199 * any, for the new policy. mpol_new() has already validated the nodes
200 * parameter with respect to the policy mode and flags. But, we need to
201 * handle an empty nodemask with MPOL_PREFERRED here.
202 *
203 * Must be called holding task's alloc_lock to protect task's mems_allowed
204 * and mempolicy. May also be called holding the mmap_semaphore for write.
205 */
206static int mpol_set_nodemask(struct mempolicy *pol,
207             const nodemask_t *nodes, struct nodemask_scratch *nsc)
208{
209    int ret;
210
211    /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
212    if (pol == NULL)
213        return 0;
214    /* Check N_HIGH_MEMORY */
215    nodes_and(nsc->mask1,
216          cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
217
218    VM_BUG_ON(!nodes);
219    if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
220        nodes = NULL; /* explicit local allocation */
221    else {
222        if (pol->flags & MPOL_F_RELATIVE_NODES)
223            mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
224        else
225            nodes_and(nsc->mask2, *nodes, nsc->mask1);
226
227        if (mpol_store_user_nodemask(pol))
228            pol->w.user_nodemask = *nodes;
229        else
230            pol->w.cpuset_mems_allowed =
231                        cpuset_current_mems_allowed;
232    }
233
234    if (nodes)
235        ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
236    else
237        ret = mpol_ops[pol->mode].create(pol, NULL);
238    return ret;
239}
240
241/*
242 * This function just creates a new policy, does some checks and simple
243 * initialization. You must invoke mpol_set_nodemask() to set nodes.
244 */
245static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
246                  nodemask_t *nodes)
247{
248    struct mempolicy *policy;
249
250    pr_debug("setting mode %d flags %d nodes[0] %lx\n",
251         mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
252
253    if (mode == MPOL_DEFAULT) {
254        if (nodes && !nodes_empty(*nodes))
255            return ERR_PTR(-EINVAL);
256        return NULL; /* simply delete any existing policy */
257    }
258    VM_BUG_ON(!nodes);
259
260    /*
261     * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
262     * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
263     * All other modes require a valid pointer to a non-empty nodemask.
264     */
265    if (mode == MPOL_PREFERRED) {
266        if (nodes_empty(*nodes)) {
267            if (((flags & MPOL_F_STATIC_NODES) ||
268                 (flags & MPOL_F_RELATIVE_NODES)))
269                return ERR_PTR(-EINVAL);
270        }
271    } else if (nodes_empty(*nodes))
272        return ERR_PTR(-EINVAL);
273    policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
274    if (!policy)
275        return ERR_PTR(-ENOMEM);
276    atomic_set(&policy->refcnt, 1);
277    policy->mode = mode;
278    policy->flags = flags;
279
280    return policy;
281}
282
283/* Slow path of a mpol destructor. */
284void __mpol_put(struct mempolicy *p)
285{
286    if (!atomic_dec_and_test(&p->refcnt))
287        return;
288    kmem_cache_free(policy_cache, p);
289}
290
291static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
292                enum mpol_rebind_step step)
293{
294}
295
296/*
297 * step:
298 * MPOL_REBIND_ONCE - do rebind work at once
299 * MPOL_REBIND_STEP1 - set all the newly nodes
300 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
301 */
302static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
303                 enum mpol_rebind_step step)
304{
305    nodemask_t tmp;
306
307    if (pol->flags & MPOL_F_STATIC_NODES)
308        nodes_and(tmp, pol->w.user_nodemask, *nodes);
309    else if (pol->flags & MPOL_F_RELATIVE_NODES)
310        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
311    else {
312        /*
313         * if step == 1, we use ->w.cpuset_mems_allowed to cache the
314         * result
315         */
316        if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
317            nodes_remap(tmp, pol->v.nodes,
318                    pol->w.cpuset_mems_allowed, *nodes);
319            pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
320        } else if (step == MPOL_REBIND_STEP2) {
321            tmp = pol->w.cpuset_mems_allowed;
322            pol->w.cpuset_mems_allowed = *nodes;
323        } else
324            BUG();
325    }
326
327    if (nodes_empty(tmp))
328        tmp = *nodes;
329
330    if (step == MPOL_REBIND_STEP1)
331        nodes_or(pol->v.nodes, pol->v.nodes, tmp);
332    else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
333        pol->v.nodes = tmp;
334    else
335        BUG();
336
337    if (!node_isset(current->il_next, tmp)) {
338        current->il_next = next_node(current->il_next, tmp);
339        if (current->il_next >= MAX_NUMNODES)
340            current->il_next = first_node(tmp);
341        if (current->il_next >= MAX_NUMNODES)
342            current->il_next = numa_node_id();
343    }
344}
345
346static void mpol_rebind_preferred(struct mempolicy *pol,
347                  const nodemask_t *nodes,
348                  enum mpol_rebind_step step)
349{
350    nodemask_t tmp;
351
352    if (pol->flags & MPOL_F_STATIC_NODES) {
353        int node = first_node(pol->w.user_nodemask);
354
355        if (node_isset(node, *nodes)) {
356            pol->v.preferred_node = node;
357            pol->flags &= ~MPOL_F_LOCAL;
358        } else
359            pol->flags |= MPOL_F_LOCAL;
360    } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
361        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
362        pol->v.preferred_node = first_node(tmp);
363    } else if (!(pol->flags & MPOL_F_LOCAL)) {
364        pol->v.preferred_node = node_remap(pol->v.preferred_node,
365                           pol->w.cpuset_mems_allowed,
366                           *nodes);
367        pol->w.cpuset_mems_allowed = *nodes;
368    }
369}
370
371/*
372 * mpol_rebind_policy - Migrate a policy to a different set of nodes
373 *
374 * If a read-side task has no lock to protect task->mempolicy, the
375 * write-side task will rebind task->mempolicy in two steps. The first
376 * step sets all the newly allowed nodes, and the second step clears all
377 * the disallowed nodes. This way we avoid leaving a window in which no
378 * node is allowed for allocation.
379 * If we have a lock to protect task->mempolicy on the read side, we
380 * rebind directly.
381 *
382 * step:
383 * MPOL_REBIND_ONCE - do the rebind work in one pass
384 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
385 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
386 */
387static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
388                enum mpol_rebind_step step)
389{
390    if (!pol)
391        return;
392    if (!mpol_store_user_nodemask(pol) && step == 0 &&
393        nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
394        return;
395
396    if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
397        return;
398
399    if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
400        BUG();
401
402    if (step == MPOL_REBIND_STEP1)
403        pol->flags |= MPOL_F_REBINDING;
404    else if (step == MPOL_REBIND_STEP2)
405        pol->flags &= ~MPOL_F_REBINDING;
406    else if (step >= MPOL_REBIND_NSTEP)
407        BUG();
408
409    mpol_ops[pol->mode].rebind(pol, newmask, step);
410}
411
412/*
413 * Wrapper for mpol_rebind_policy() that just requires task
414 * pointer, and updates task mempolicy.
415 *
416 * Called with task's alloc_lock held.
417 */
418
419void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
420            enum mpol_rebind_step step)
421{
422    mpol_rebind_policy(tsk->mempolicy, new, step);
423}
424
425/*
426 * Rebind each vma in mm to new nodemask.
427 *
428 * Call holding a reference to mm. Takes mm->mmap_sem during call.
429 */
430
431void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
432{
433    struct vm_area_struct *vma;
434
435    down_write(&mm->mmap_sem);
436    for (vma = mm->mmap; vma; vma = vma->vm_next)
437        mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
438    up_write(&mm->mmap_sem);
439}
440
441static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
442    [MPOL_DEFAULT] = {
443        .rebind = mpol_rebind_default,
444    },
445    [MPOL_INTERLEAVE] = {
446        .create = mpol_new_interleave,
447        .rebind = mpol_rebind_nodemask,
448    },
449    [MPOL_PREFERRED] = {
450        .create = mpol_new_preferred,
451        .rebind = mpol_rebind_preferred,
452    },
453    [MPOL_BIND] = {
454        .create = mpol_new_bind,
455        .rebind = mpol_rebind_nodemask,
456    },
457};
458
459static void migrate_page_add(struct page *page, struct list_head *pagelist,
460                unsigned long flags);
461
462/* Scan through the pages in this range, checking whether each one satisfies the nodemask/flags conditions. */
463static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
464        unsigned long addr, unsigned long end,
465        const nodemask_t *nodes, unsigned long flags,
466        void *private)
467{
468    pte_t *orig_pte;
469    pte_t *pte;
470    spinlock_t *ptl;
471
472    orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
473    do {
474        struct page *page;
475        int nid;
476
477        if (!pte_present(*pte))
478            continue;
479        page = vm_normal_page(vma, addr, *pte);
480        if (!page)
481            continue;
482        /*
483         * vm_normal_page() filters out zero pages, but there might
484         * still be PageReserved pages to skip, perhaps in a VDSO.
485         * And we cannot move PageKsm pages sensibly or safely yet.
486         */
487        if (PageReserved(page) || PageKsm(page))
488            continue;
489        nid = page_to_nid(page);
490        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
491            continue;
492
493        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
494            migrate_page_add(page, private, flags);
495        else
496            break;
497    } while (pte++, addr += PAGE_SIZE, addr != end);
498    pte_unmap_unlock(orig_pte, ptl);
499    return addr != end;
500}
501
502static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
503        unsigned long addr, unsigned long end,
504        const nodemask_t *nodes, unsigned long flags,
505        void *private)
506{
507    pmd_t *pmd;
508    unsigned long next;
509
510    pmd = pmd_offset(pud, addr);
511    do {
512        next = pmd_addr_end(addr, end);
513        split_huge_page_pmd(vma->vm_mm, pmd);
514        if (pmd_none_or_clear_bad(pmd))
515            continue;
516        if (check_pte_range(vma, pmd, addr, next, nodes,
517                    flags, private))
518            return -EIO;
519    } while (pmd++, addr = next, addr != end);
520    return 0;
521}
522
523static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
524        unsigned long addr, unsigned long end,
525        const nodemask_t *nodes, unsigned long flags,
526        void *private)
527{
528    pud_t *pud;
529    unsigned long next;
530
531    pud = pud_offset(pgd, addr);
532    do {
533        next = pud_addr_end(addr, end);
534        if (pud_none_or_clear_bad(pud))
535            continue;
536        if (check_pmd_range(vma, pud, addr, next, nodes,
537                    flags, private))
538            return -EIO;
539    } while (pud++, addr = next, addr != end);
540    return 0;
541}
542
543static inline int check_pgd_range(struct vm_area_struct *vma,
544        unsigned long addr, unsigned long end,
545        const nodemask_t *nodes, unsigned long flags,
546        void *private)
547{
548    pgd_t *pgd;
549    unsigned long next;
550
551    pgd = pgd_offset(vma->vm_mm, addr);
552    do {
553        next = pgd_addr_end(addr, end);
554        if (pgd_none_or_clear_bad(pgd))
555            continue;
556        if (check_pud_range(vma, pgd, addr, next, nodes,
557                    flags, private))
558            return -EIO;
559    } while (pgd++, addr = next, addr != end);
560    return 0;
561}
562
563/*
564 * Check if all pages in a range are on a set of nodes.
565 * If pagelist != NULL then isolate pages from the LRU and
566 * put them on the pagelist.
567 */
568static struct vm_area_struct *
569check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
570        const nodemask_t *nodes, unsigned long flags, void *private)
571{
572    int err;
573    struct vm_area_struct *first, *vma, *prev;
574
575
576    first = find_vma(mm, start);
577    if (!first)
578        return ERR_PTR(-EFAULT);
579    prev = NULL;
580    for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
581        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
582            if (!vma->vm_next && vma->vm_end < end)
583                return ERR_PTR(-EFAULT);
584            if (prev && prev->vm_end < vma->vm_start)
585                return ERR_PTR(-EFAULT);
586        }
587        if (!is_vm_hugetlb_page(vma) &&
588            ((flags & MPOL_MF_STRICT) ||
589             ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
590                vma_migratable(vma)))) {
591            unsigned long endvma = vma->vm_end;
592
593            if (endvma > end)
594                endvma = end;
595            if (vma->vm_start > start)
596                start = vma->vm_start;
597            err = check_pgd_range(vma, start, endvma, nodes,
598                        flags, private);
599            if (err) {
600                first = ERR_PTR(err);
601                break;
602            }
603        }
604        prev = vma;
605    }
606    return first;
607}
608
609/* Apply policy to a single VMA */
610static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
611{
612    int err = 0;
613    struct mempolicy *old = vma->vm_policy;
614
615    pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
616         vma->vm_start, vma->vm_end, vma->vm_pgoff,
617         vma->vm_ops, vma->vm_file,
618         vma->vm_ops ? vma->vm_ops->set_policy : NULL);
619
620    if (vma->vm_ops && vma->vm_ops->set_policy)
621        err = vma->vm_ops->set_policy(vma, new);
622    if (!err) {
623        mpol_get(new);
624        vma->vm_policy = new;
625        mpol_put(old);
626    }
627    return err;
628}
629
630/* Step 2: apply policy to a range and do splits. */
631static int mbind_range(struct mm_struct *mm, unsigned long start,
632               unsigned long end, struct mempolicy *new_pol)
633{
634    struct vm_area_struct *next;
635    struct vm_area_struct *prev;
636    struct vm_area_struct *vma;
637    int err = 0;
638    pgoff_t pgoff;
639    unsigned long vmstart;
640    unsigned long vmend;
641
642    vma = find_vma_prev(mm, start, &prev);
643    if (!vma || vma->vm_start > start)
644        return -EFAULT;
645
646    for (; vma && vma->vm_start < end; prev = vma, vma = next) {
647        next = vma->vm_next;
648        vmstart = max(start, vma->vm_start);
649        vmend = min(end, vma->vm_end);
650
651        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
652        prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653                  vma->anon_vma, vma->vm_file, pgoff, new_pol);
654        if (prev) {
655            vma = prev;
656            next = vma->vm_next;
657            continue;
658        }
659        if (vma->vm_start != vmstart) {
660            err = split_vma(vma->vm_mm, vma, vmstart, 1);
661            if (err)
662                goto out;
663        }
664        if (vma->vm_end != vmend) {
665            err = split_vma(vma->vm_mm, vma, vmend, 0);
666            if (err)
667                goto out;
668        }
669        err = policy_vma(vma, new_pol);
670        if (err)
671            goto out;
672    }
673
674 out:
675    return err;
676}
677
678/*
679 * Update task->flags PF_MEMPOLICY bit: set iff non-default
680 * mempolicy. Allows more rapid checking of this (combined perhaps
681 * with other PF_* flag bits) on memory allocation hot code paths.
682 *
683 * If called from outside this file, the task 'p' should -only- be
684 * a newly forked child not yet visible on the task list, because
685 * manipulating the task flags of a visible task is not safe.
686 *
687 * The above limitation is why this routine has the funny name
688 * mpol_fix_fork_child_flag().
689 *
690 * It is also safe to call this with a task pointer of current,
691 * which the static wrapper mpol_set_task_struct_flag() does,
692 * for use within this file.
693 */
694
695void mpol_fix_fork_child_flag(struct task_struct *p)
696{
697    if (p->mempolicy)
698        p->flags |= PF_MEMPOLICY;
699    else
700        p->flags &= ~PF_MEMPOLICY;
701}
702
703static void mpol_set_task_struct_flag(void)
704{
705    mpol_fix_fork_child_flag(current);
706}
707
708/* Set the process memory policy */
709static long do_set_mempolicy(unsigned short mode, unsigned short flags,
710                 nodemask_t *nodes)
711{
712    struct mempolicy *new, *old;
713    struct mm_struct *mm = current->mm;
714    NODEMASK_SCRATCH(scratch);
715    int ret;
716
717    if (!scratch)
718        return -ENOMEM;
719
720    new = mpol_new(mode, flags, nodes);
721    if (IS_ERR(new)) {
722        ret = PTR_ERR(new);
723        goto out;
724    }
725    /*
726     * prevent changing our mempolicy while show_numa_maps()
727     * is using it.
728     * Note: do_set_mempolicy() can be called at init time
729     * with no 'mm'.
730     */
731    if (mm)
732        down_write(&mm->mmap_sem);
733    task_lock(current);
734    ret = mpol_set_nodemask(new, nodes, scratch);
735    if (ret) {
736        task_unlock(current);
737        if (mm)
738            up_write(&mm->mmap_sem);
739        mpol_put(new);
740        goto out;
741    }
742    old = current->mempolicy;
743    current->mempolicy = new;
744    mpol_set_task_struct_flag();
745    if (new && new->mode == MPOL_INTERLEAVE &&
746        nodes_weight(new->v.nodes))
747        current->il_next = first_node(new->v.nodes);
748    task_unlock(current);
749    if (mm)
750        up_write(&mm->mmap_sem);
751
752    mpol_put(old);
753    ret = 0;
754out:
755    NODEMASK_SCRATCH_FREE(scratch);
756    return ret;
757}
758
759/*
760 * Return nodemask for policy for get_mempolicy() query
761 *
762 * Called with task's alloc_lock held
763 */
764static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
765{
766    nodes_clear(*nodes);
767    if (p == &default_policy)
768        return;
769
770    switch (p->mode) {
771    case MPOL_BIND:
772        /* Fall through */
773    case MPOL_INTERLEAVE:
774        *nodes = p->v.nodes;
775        break;
776    case MPOL_PREFERRED:
777        if (!(p->flags & MPOL_F_LOCAL))
778            node_set(p->v.preferred_node, *nodes);
779        /* else return empty node mask for local allocation */
780        break;
781    default:
782        BUG();
783    }
784}
785
786static int lookup_node(struct mm_struct *mm, unsigned long addr)
787{
788    struct page *p;
789    int err;
790
791    err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
792    if (err >= 0) {
793        err = page_to_nid(p);
794        put_page(p);
795    }
796    return err;
797}
798
799/* Retrieve NUMA policy */
800static long do_get_mempolicy(int *policy, nodemask_t *nmask,
801                 unsigned long addr, unsigned long flags)
802{
803    int err;
804    struct mm_struct *mm = current->mm;
805    struct vm_area_struct *vma = NULL;
806    struct mempolicy *pol = current->mempolicy;
807
808    if (flags &
809        ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
810        return -EINVAL;
811
812    if (flags & MPOL_F_MEMS_ALLOWED) {
813        if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
814            return -EINVAL;
815        *policy = 0; /* just so it's initialized */
816        task_lock(current);
817        *nmask = cpuset_current_mems_allowed;
818        task_unlock(current);
819        return 0;
820    }
821
822    if (flags & MPOL_F_ADDR) {
823        /*
824         * Do NOT fall back to task policy if the
825         * vma/shared policy at addr is NULL. We
826         * want to return MPOL_DEFAULT in this case.
827         */
828        down_read(&mm->mmap_sem);
829        vma = find_vma_intersection(mm, addr, addr+1);
830        if (!vma) {
831            up_read(&mm->mmap_sem);
832            return -EFAULT;
833        }
834        if (vma->vm_ops && vma->vm_ops->get_policy)
835            pol = vma->vm_ops->get_policy(vma, addr);
836        else
837            pol = vma->vm_policy;
838    } else if (addr)
839        return -EINVAL;
840
841    if (!pol)
842        pol = &default_policy; /* indicates default behavior */
843
844    if (flags & MPOL_F_NODE) {
845        if (flags & MPOL_F_ADDR) {
846            err = lookup_node(mm, addr);
847            if (err < 0)
848                goto out;
849            *policy = err;
850        } else if (pol == current->mempolicy &&
851                pol->mode == MPOL_INTERLEAVE) {
852            *policy = current->il_next;
853        } else {
854            err = -EINVAL;
855            goto out;
856        }
857    } else {
858        *policy = pol == &default_policy ? MPOL_DEFAULT :
859                        pol->mode;
860        /*
861         * Internal mempolicy flags must be masked off before exposing
862         * the policy to userspace.
863         */
864        *policy |= (pol->flags & MPOL_MODE_FLAGS);
865    }
866
867    if (vma) {
868        up_read(&current->mm->mmap_sem);
869        vma = NULL;
870    }
871
872    err = 0;
873    if (nmask) {
874        if (mpol_store_user_nodemask(pol)) {
875            *nmask = pol->w.user_nodemask;
876        } else {
877            task_lock(current);
878            get_policy_nodemask(pol, nmask);
879            task_unlock(current);
880        }
881    }
882
883 out:
884    mpol_cond_put(pol);
885    if (vma)
886        up_read(&current->mm->mmap_sem);
887    return err;
888}
889
890#ifdef CONFIG_MIGRATION
891/*
892 * page migration
893 */
894static void migrate_page_add(struct page *page, struct list_head *pagelist,
895                unsigned long flags)
896{
897    /*
898     * Avoid migrating a page that is shared with others.
899     */
900    if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
901        if (!isolate_lru_page(page)) {
902            list_add_tail(&page->lru, pagelist);
903            inc_zone_page_state(page, NR_ISOLATED_ANON +
904                        page_is_file_cache(page));
905        }
906    }
907}
908
909static struct page *new_node_page(struct page *page, unsigned long node, int **x)
910{
911    return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
912}
913
914/*
915 * Migrate pages from one node to a target node.
916 * Returns error or the number of pages not migrated.
917 */
918static int migrate_to_node(struct mm_struct *mm, int source, int dest,
919               int flags)
920{
921    nodemask_t nmask;
922    LIST_HEAD(pagelist);
923    int err = 0;
924    struct vm_area_struct *vma;
925
926    nodes_clear(nmask);
927    node_set(source, nmask);
928
929    vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
930            flags | MPOL_MF_DISCONTIG_OK, &pagelist);
931    if (IS_ERR(vma))
932        return PTR_ERR(vma);
933
934    if (!list_empty(&pagelist)) {
935        err = migrate_pages(&pagelist, new_node_page, dest,
936                                false, true);
937        if (err)
938            putback_lru_pages(&pagelist);
939    }
940
941    return err;
942}
943
944/*
945 * Move pages between the two nodesets so as to preserve the physical
946 * layout as much as possible.
947 *
948 * Returns the number of pages that could not be moved.
949 */
950int do_migrate_pages(struct mm_struct *mm,
951    const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
952{
953    int busy = 0;
954    int err;
955    nodemask_t tmp;
956
957    err = migrate_prep();
958    if (err)
959        return err;
960
961    down_read(&mm->mmap_sem);
962
963    err = migrate_vmas(mm, from_nodes, to_nodes, flags);
964    if (err)
965        goto out;
966
967    /*
968     * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
969     * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
970     * bit in 'tmp', and return that <source, dest> pair for migration.
971     * The pair of nodemasks 'to' and 'from' define the map.
972     *
973     * If no pair of bits is found that way, fallback to picking some
974     * pair of 'source' and 'dest' bits that are not the same. If the
975     * 'source' and 'dest' bits are the same, this represents a node
976     * that will be migrating to itself, so no pages need move.
977     *
978     * If no bits are left in 'tmp', or if all remaining bits left
979     * in 'tmp' correspond to the same bit in 'to', return false
980     * (nothing left to migrate).
981     *
982     * This lets us pick a pair of nodes to migrate between, such that
983     * if possible the dest node is not already occupied by some other
984     * source node, minimizing the risk of overloading the memory on a
985     * node that would happen if we migrated incoming memory to a node
986     * before migrating outgoing memory away from that same node.
987     *
988     * A single scan of tmp is sufficient. As we go, we remember the
989     * most recent <s, d> pair that moved (s != d). If we find a pair
990     * that not only moved, but what's better, moved to an empty slot
991     * (d is not set in tmp), then we break out then, with that pair.
992     * Otherwise when we finish scanning from_tmp, we at least have the
993     * most recent <s, d> pair that moved. If we get all the way through
994     * the scan of tmp without finding any node that moved, much less
995     * moved to an empty node, then there is nothing left worth migrating.
996     */
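    /*
     * Worked example: from_nodes = {0,1}, to_nodes = {1,2}. Scanning
     * tmp = {0,1}: s=0 remaps to d=1, but node 1 is still a pending
     * source, so remember the pair and keep looking; s=1 remaps to
     * d=2, which is not a remaining source, so migrate 1 -> 2 first.
     * On the next pass tmp = {0} and we migrate 0 -> 1, after node 1
     * has already been drained.
     */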
997
998    tmp = *from_nodes;
999    while (!nodes_empty(tmp)) {
1000        int s,d;
1001        int source = -1;
1002        int dest = 0;
1003
1004        for_each_node_mask(s, tmp) {
1005            d = node_remap(s, *from_nodes, *to_nodes);
1006            if (s == d)
1007                continue;
1008
1009            source = s; /* Node moved. Memorize */
1010            dest = d;
1011
1012            /* dest not in remaining from nodes? */
1013            if (!node_isset(dest, tmp))
1014                break;
1015        }
1016        if (source == -1)
1017            break;
1018
1019        node_clear(source, tmp);
1020        err = migrate_to_node(mm, source, dest, flags);
1021        if (err > 0)
1022            busy += err;
1023        if (err < 0)
1024            break;
1025    }
1026out:
1027    up_read(&mm->mmap_sem);
1028    if (err < 0)
1029        return err;
1030    return busy;
1031
1032}
1033
1034/*
1035 * Allocate a new page for page migration based on vma policy.
1036 * Start assuming that page is mapped by vma pointed to by @private.
1037 * Search forward from there, if not. N.B., this assumes that the
1038 * list of pages handed to migrate_pages()--which is how we get here--
1039 * is in virtual address order.
1040 */
1041static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1042{
1043    struct vm_area_struct *vma = (struct vm_area_struct *)private;
1044    unsigned long uninitialized_var(address);
1045
1046    while (vma) {
1047        address = page_address_in_vma(page, vma);
1048        if (address != -EFAULT)
1049            break;
1050        vma = vma->vm_next;
1051    }
1052
1053    /*
1054     * if !vma, alloc_page_vma() will use task or system default policy
1055     */
1056    return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1057}
1058#else
1059
1060static void migrate_page_add(struct page *page, struct list_head *pagelist,
1061                unsigned long flags)
1062{
1063}
1064
1065int do_migrate_pages(struct mm_struct *mm,
1066    const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
1067{
1068    return -ENOSYS;
1069}
1070
1071static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1072{
1073    return NULL;
1074}
1075#endif
1076
1077static long do_mbind(unsigned long start, unsigned long len,
1078             unsigned short mode, unsigned short mode_flags,
1079             nodemask_t *nmask, unsigned long flags)
1080{
1081    struct vm_area_struct *vma;
1082    struct mm_struct *mm = current->mm;
1083    struct mempolicy *new;
1084    unsigned long end;
1085    int err;
1086    LIST_HEAD(pagelist);
1087
1088    if (flags & ~(unsigned long)(MPOL_MF_STRICT |
1089                     MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
1090        return -EINVAL;
1091    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1092        return -EPERM;
1093
1094    if (start & ~PAGE_MASK)
1095        return -EINVAL;
1096
1097    if (mode == MPOL_DEFAULT)
1098        flags &= ~MPOL_MF_STRICT;
1099
1100    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1101    end = start + len;
1102
1103    if (end < start)
1104        return -EINVAL;
1105    if (end == start)
1106        return 0;
1107
1108    new = mpol_new(mode, mode_flags, nmask);
1109    if (IS_ERR(new))
1110        return PTR_ERR(new);
1111
1112    /*
1113     * If we are using the default policy then operation
1114     * on discontinuous address spaces is okay after all
1115     */
1116    if (!new)
1117        flags |= MPOL_MF_DISCONTIG_OK;
1118
1119    pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1120         start, start + len, mode, mode_flags,
1121         nmask ? nodes_addr(*nmask)[0] : -1);
1122
1123    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1124
1125        err = migrate_prep();
1126        if (err)
1127            goto mpol_out;
1128    }
1129    {
1130        NODEMASK_SCRATCH(scratch);
1131        if (scratch) {
1132            down_write(&mm->mmap_sem);
1133            task_lock(current);
1134            err = mpol_set_nodemask(new, nmask, scratch);
1135            task_unlock(current);
1136            if (err)
1137                up_write(&mm->mmap_sem);
1138        } else
1139            err = -ENOMEM;
1140        NODEMASK_SCRATCH_FREE(scratch);
1141    }
1142    if (err)
1143        goto mpol_out;
1144
1145    vma = check_range(mm, start, end, nmask,
1146              flags | MPOL_MF_INVERT, &pagelist);
1147
1148    err = PTR_ERR(vma);
1149    if (!IS_ERR(vma)) {
1150        int nr_failed = 0;
1151
1152        err = mbind_range(mm, start, end, new);
1153
1154        if (!list_empty(&pagelist)) {
1155            nr_failed = migrate_pages(&pagelist, new_vma_page,
1156                        (unsigned long)vma,
1157                        false, true);
1158            if (nr_failed)
1159                putback_lru_pages(&pagelist);
1160        }
1161
1162        if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1163            err = -EIO;
1164    } else
1165        putback_lru_pages(&pagelist);
1166
1167    up_write(&mm->mmap_sem);
1168 mpol_out:
1169    mpol_put(new);
1170    return err;
1171}
1172
1173/*
1174 * User space interface with variable sized bitmaps for nodelists.
1175 */
1176
1177/* Copy a node mask from user space. */
1178static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1179             unsigned long maxnode)
1180{
1181    unsigned long k;
1182    unsigned long nlongs;
1183    unsigned long endmask;
1184
1185    --maxnode;
1186    nodes_clear(*nodes);
1187    if (maxnode == 0 || !nmask)
1188        return 0;
1189    if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1190        return -EINVAL;
1191
1192    nlongs = BITS_TO_LONGS(maxnode);
1193    if ((maxnode % BITS_PER_LONG) == 0)
1194        endmask = ~0UL;
1195    else
1196        endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1197
1198    /* If the user specified more nodes than supported, just check
1199       that the unsupported part is all zero. */
1200    if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1201        if (nlongs > PAGE_SIZE/sizeof(long))
1202            return -EINVAL;
1203        for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1204            unsigned long t;
1205            if (get_user(t, nmask + k))
1206                return -EFAULT;
1207            if (k == nlongs - 1) {
1208                if (t & endmask)
1209                    return -EINVAL;
1210            } else if (t)
1211                return -EINVAL;
1212        }
1213        nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1214        endmask = ~0UL;
1215    }
1216
1217    if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1218        return -EFAULT;
1219    nodes_addr(*nodes)[nlongs-1] &= endmask;
1220    return 0;
1221}
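/*
 * For example, on a 64-bit kernel a typical caller passes
 * maxnode = 8 * sizeof(unsigned long) + 1 = 65: after --maxnode there are
 * 64 significant bits, so nlongs = 1 and endmask = ~0UL, and the whole
 * first word of the user mask is copied.
 */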
1222
1223/* Copy a kernel node mask to user space */
1224static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1225                  nodemask_t *nodes)
1226{
1227    unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1228    const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1229
1230    if (copy > nbytes) {
1231        if (copy > PAGE_SIZE)
1232            return -EINVAL;
1233        if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1234            return -EFAULT;
1235        copy = nbytes;
1236    }
1237    return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1238}
1239
1240SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1241        unsigned long, mode, unsigned long __user *, nmask,
1242        unsigned long, maxnode, unsigned, flags)
1243{
1244    nodemask_t nodes;
1245    int err;
1246    unsigned short mode_flags;
1247
1248    mode_flags = mode & MPOL_MODE_FLAGS;
1249    mode &= ~MPOL_MODE_FLAGS;
1250    if (mode >= MPOL_MAX)
1251        return -EINVAL;
1252    if ((mode_flags & MPOL_F_STATIC_NODES) &&
1253        (mode_flags & MPOL_F_RELATIVE_NODES))
1254        return -EINVAL;
1255    err = get_nodes(&nodes, nmask, maxnode);
1256    if (err)
1257        return err;
1258    return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1259}
1260
1261/* Set the process memory policy */
1262SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1263        unsigned long, maxnode)
1264{
1265    int err;
1266    nodemask_t nodes;
1267    unsigned short flags;
1268
1269    flags = mode & MPOL_MODE_FLAGS;
1270    mode &= ~MPOL_MODE_FLAGS;
1271    if ((unsigned int)mode >= MPOL_MAX)
1272        return -EINVAL;
1273    if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1274        return -EINVAL;
1275    err = get_nodes(&nodes, nmask, maxnode);
1276    if (err)
1277        return err;
1278    return do_set_mempolicy(mode, flags, &nodes);
1279}
1280
1281SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1282        const unsigned long __user *, old_nodes,
1283        const unsigned long __user *, new_nodes)
1284{
1285    const struct cred *cred = current_cred(), *tcred;
1286    struct mm_struct *mm = NULL;
1287    struct task_struct *task;
1288    nodemask_t task_nodes;
1289    int err;
1290    nodemask_t *old;
1291    nodemask_t *new;
1292    NODEMASK_SCRATCH(scratch);
1293
1294    if (!scratch)
1295        return -ENOMEM;
1296
1297    old = &scratch->mask1;
1298    new = &scratch->mask2;
1299
1300    err = get_nodes(old, old_nodes, maxnode);
1301    if (err)
1302        goto out;
1303
1304    err = get_nodes(new, new_nodes, maxnode);
1305    if (err)
1306        goto out;
1307
1308    /* Find the mm_struct */
1309    rcu_read_lock();
1310    task = pid ? find_task_by_vpid(pid) : current;
1311    if (!task) {
1312        rcu_read_unlock();
1313        err = -ESRCH;
1314        goto out;
1315    }
1316    mm = get_task_mm(task);
1317    rcu_read_unlock();
1318
1319    err = -EINVAL;
1320    if (!mm)
1321        goto out;
1322
1323    /*
1324     * Check if this process has the right to modify the specified
1325     * process. The right exists if the process has administrative
1326     * capabilities, superuser privileges or the same
1327     * userid as the target process.
1328     */
1329    rcu_read_lock();
1330    tcred = __task_cred(task);
1331    if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1332        cred->uid != tcred->suid && cred->uid != tcred->uid &&
1333        !capable(CAP_SYS_NICE)) {
1334        rcu_read_unlock();
1335        err = -EPERM;
1336        goto out;
1337    }
1338    rcu_read_unlock();
1339
1340    task_nodes = cpuset_mems_allowed(task);
1341    /* Is the user allowed to access the target nodes? */
1342    if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1343        err = -EPERM;
1344        goto out;
1345    }
1346
1347    if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1348        err = -EINVAL;
1349        goto out;
1350    }
1351
1352    err = security_task_movememory(task);
1353    if (err)
1354        goto out;
1355
1356    err = do_migrate_pages(mm, old, new,
1357        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1358out:
1359    if (mm)
1360        mmput(mm);
1361    NODEMASK_SCRATCH_FREE(scratch);
1362
1363    return err;
1364}
1365
1366
1367/* Retrieve NUMA policy */
1368SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1369        unsigned long __user *, nmask, unsigned long, maxnode,
1370        unsigned long, addr, unsigned long, flags)
1371{
1372    int err;
1373    int uninitialized_var(pval);
1374    nodemask_t nodes;
1375
1376    if (nmask != NULL && maxnode < MAX_NUMNODES)
1377        return -EINVAL;
1378
1379    err = do_get_mempolicy(&pval, &nodes, addr, flags);
1380
1381    if (err)
1382        return err;
1383
1384    if (policy && put_user(pval, policy))
1385        return -EFAULT;
1386
1387    if (nmask)
1388        err = copy_nodes_to_user(nmask, maxnode, &nodes);
1389
1390    return err;
1391}
1392
1393#ifdef CONFIG_COMPAT
1394
1395asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1396                     compat_ulong_t __user *nmask,
1397                     compat_ulong_t maxnode,
1398                     compat_ulong_t addr, compat_ulong_t flags)
1399{
1400    long err;
1401    unsigned long __user *nm = NULL;
1402    unsigned long nr_bits, alloc_size;
1403    DECLARE_BITMAP(bm, MAX_NUMNODES);
1404
1405    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1406    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1407
1408    if (nmask)
1409        nm = compat_alloc_user_space(alloc_size);
1410
1411    err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412
1413    if (!err && nmask) {
1414        err = copy_from_user(bm, nm, alloc_size);
1415        /* ensure entire bitmap is zeroed */
1416        err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417        err |= compat_put_bitmap(nmask, bm, nr_bits);
1418    }
1419
1420    return err;
1421}
1422
1423asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1424                     compat_ulong_t maxnode)
1425{
1426    long err = 0;
1427    unsigned long __user *nm = NULL;
1428    unsigned long nr_bits, alloc_size;
1429    DECLARE_BITMAP(bm, MAX_NUMNODES);
1430
1431    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1432    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1433
1434    if (nmask) {
1435        err = compat_get_bitmap(bm, nmask, nr_bits);
1436        nm = compat_alloc_user_space(alloc_size);
1437        err |= copy_to_user(nm, bm, alloc_size);
1438    }
1439
1440    if (err)
1441        return -EFAULT;
1442
1443    return sys_set_mempolicy(mode, nm, nr_bits+1);
1444}
1445
1446asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1447                 compat_ulong_t mode, compat_ulong_t __user *nmask,
1448                 compat_ulong_t maxnode, compat_ulong_t flags)
1449{
1450    long err = 0;
1451    unsigned long __user *nm = NULL;
1452    unsigned long nr_bits, alloc_size;
1453    nodemask_t bm;
1454
1455    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1456    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1457
1458    if (nmask) {
1459        err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1460        nm = compat_alloc_user_space(alloc_size);
1461        err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1462    }
1463
1464    if (err)
1465        return -EFAULT;
1466
1467    return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1468}
1469
1470#endif
1471
1472/*
1473 * get_vma_policy(@task, @vma, @addr)
1474 * @task - task for fallback if vma policy == default
1475 * @vma - virtual memory area whose policy is sought
1476 * @addr - address in @vma for shared policy lookup
1477 *
1478 * Returns effective policy for a VMA at specified address.
1479 * Falls back to @task or system default policy, as necessary.
1480 * Current or other task's task mempolicy and non-shared vma policies
1481 * are protected by the task's mmap_sem, which must be held for read by
1482 * the caller.
1483 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1484 * count--added by the get_policy() vm_op, as appropriate--to protect against
1485 * freeing by another task. It is the caller's responsibility to free the
1486 * extra reference for shared policies.
1487 */
1488struct mempolicy *get_vma_policy(struct task_struct *task,
1489        struct vm_area_struct *vma, unsigned long addr)
1490{
1491    struct mempolicy *pol = task->mempolicy;
1492
1493    if (vma) {
1494        if (vma->vm_ops && vma->vm_ops->get_policy) {
1495            struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1496                                    addr);
1497            if (vpol)
1498                pol = vpol;
1499        } else if (vma->vm_policy)
1500            pol = vma->vm_policy;
1501    }
1502    if (!pol)
1503        pol = &default_policy;
1504    return pol;
1505}
1506
1507/*
1508 * Return a nodemask representing a mempolicy for filtering nodes for
1509 * page allocation
1510 */
1511static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1512{
1513    /* Lower zones don't get a nodemask applied for MPOL_BIND */
1514    if (unlikely(policy->mode == MPOL_BIND) &&
1515            gfp_zone(gfp) >= policy_zone &&
1516            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1517        return &policy->v.nodes;
1518
1519    return NULL;
1520}
1521
1522/* Return a zonelist indicated by gfp for node representing a mempolicy */
1523static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1524    int nd)
1525{
1526    switch (policy->mode) {
1527    case MPOL_PREFERRED:
1528        if (!(policy->flags & MPOL_F_LOCAL))
1529            nd = policy->v.preferred_node;
1530        break;
1531    case MPOL_BIND:
1532        /*
1533         * Normally, MPOL_BIND allocations are node-local within the
1534         * allowed nodemask. However, if __GFP_THISNODE is set and the
1535         * current node isn't part of the mask, we use the zonelist for
1536         * the first node in the mask instead.
1537         */
1538        if (unlikely(gfp & __GFP_THISNODE) &&
1539                unlikely(!node_isset(nd, policy->v.nodes)))
1540            nd = first_node(policy->v.nodes);
1541        break;
1542    default:
1543        BUG();
1544    }
1545    return node_zonelist(nd, gfp);
1546}
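/*
 * For example: an MPOL_PREFERRED policy on node 3 returns node 3's
 * zonelist regardless of the local node, while an MPOL_BIND policy over
 * {2,3} called with __GFP_THISNODE from node 0 falls back to node 2's
 * zonelist, since node 0 is not in the bound set.
 */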
1547
1548/* Do dynamic interleaving for a process */
1549static unsigned interleave_nodes(struct mempolicy *policy)
1550{
1551    unsigned nid, next;
1552    struct task_struct *me = current;
1553
1554    nid = me->il_next;
1555    next = next_node(nid, policy->v.nodes);
1556    if (next >= MAX_NUMNODES)
1557        next = first_node(policy->v.nodes);
1558    if (next < MAX_NUMNODES)
1559        me->il_next = next;
1560    return nid;
1561}
1562
1563/*
1564 * Depending on the memory policy provide a node from which to allocate the
1565 * next slab entry.
1566 * @policy must be protected from freeing by the caller. If @policy is
1567 * the current task's mempolicy, this protection is implicit, as only the
1568 * task can change its policy. The system default policy requires no
1569 * such protection.
1570 */
1571unsigned slab_node(struct mempolicy *policy)
1572{
1573    if (!policy || policy->flags & MPOL_F_LOCAL)
1574        return numa_node_id();
1575
1576    switch (policy->mode) {
1577    case MPOL_PREFERRED:
1578        /*
1579         * handled MPOL_F_LOCAL above
1580         */
1581        return policy->v.preferred_node;
1582
1583    case MPOL_INTERLEAVE:
1584        return interleave_nodes(policy);
1585
1586    case MPOL_BIND: {
1587        /*
1588         * Follow bind policy behavior and start allocation at the
1589         * first node.
1590         */
1591        struct zonelist *zonelist;
1592        struct zone *zone;
1593        enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1594        zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1595        (void)first_zones_zonelist(zonelist, highest_zoneidx,
1596                            &policy->v.nodes,
1597                            &zone);
1598        return zone ? zone->node : numa_node_id();
1599    }
1600
1601    default:
1602        BUG();
1603    }
1604}
1605
1606/* Do static interleaving for a VMA with known offset. */
1607static unsigned offset_il_node(struct mempolicy *pol,
1608        struct vm_area_struct *vma, unsigned long off)
1609{
1610    unsigned nnodes = nodes_weight(pol->v.nodes);
1611    unsigned target;
1612    int c;
1613    int nid = -1;
1614
1615    if (!nnodes)
1616        return numa_node_id();
1617    target = (unsigned int)off % nnodes;
1618    c = 0;
1619    do {
1620        nid = next_node(nid, pol->v.nodes);
1621        c++;
1622    } while (c <= target);
1623    return nid;
1624}
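/*
 * For example: with an interleave mask of {0,2,5} (weight 3), an offset
 * of 7 gives target = 7 % 3 = 1, and the walk above returns the second
 * node in the mask, node 2.
 */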
1625
1626/* Determine a node number for interleave */
1627static inline unsigned interleave_nid(struct mempolicy *pol,
1628         struct vm_area_struct *vma, unsigned long addr, int shift)
1629{
1630    if (vma) {
1631        unsigned long off;
1632
1633        /*
1634         * for small pages, there is no difference between
1635         * shift and PAGE_SHIFT, so the bit-shift is safe.
1636         * for huge pages, since vm_pgoff is in units of small
1637         * pages, we need to shift off the always 0 bits to get
1638         * a useful offset.
1639         */
1640        BUG_ON(shift < PAGE_SHIFT);
1641        off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1642        off += (addr - vma->vm_start) >> shift;
1643        return offset_il_node(pol, vma, off);
1644    } else
1645        return interleave_nodes(pol);
1646}
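/*
 * For example (assuming 2MB huge pages, i.e. shift = 21 with
 * PAGE_SHIFT = 12): the file offset in huge-page units is
 * vm_pgoff >> 9 plus (addr - vm_start) >> 21, so consecutive huge pages
 * in the mapping land on consecutive nodes of the interleave mask.
 */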
1647
1648#ifdef CONFIG_HUGETLBFS
1649/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1651 * @vma = virtual memory area whose policy is sought
1652 * @addr = address in @vma for shared policy lookup and interleave policy
1653 * @gfp_flags = for requested zone
1654 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1655 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1656 *
1657 * Returns a zonelist suitable for a huge page allocation and a pointer
1658 * to the struct mempolicy for conditional unref after allocation.
1659 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1660 * @nodemask for filtering the zonelist.
1661 *
1662 * Must be protected by get_mems_allowed()
1663 */
1664struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1665                gfp_t gfp_flags, struct mempolicy **mpol,
1666                nodemask_t **nodemask)
1667{
1668    struct zonelist *zl;
1669
1670    *mpol = get_vma_policy(current, vma, addr);
1671    *nodemask = NULL; /* assume !MPOL_BIND */
1672
1673    if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1674        zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1675                huge_page_shift(hstate_vma(vma))), gfp_flags);
1676    } else {
1677        zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1678        if ((*mpol)->mode == MPOL_BIND)
1679            *nodemask = &(*mpol)->v.nodes;
1680    }
1681    return zl;
1682}
1683
1684/*
1685 * init_nodemask_of_mempolicy
1686 *
1687 * If the current task's mempolicy is "default" [NULL], return 'false'
1688 * to indicate default policy. Otherwise, extract the policy nodemask
1689 * for 'bind' or 'interleave' policy into the argument nodemask, or
1690 * initialize the argument nodemask to contain the single node for
1691 * 'preferred' or 'local' policy and return 'true' to indicate presence
1692 * of non-default mempolicy.
1693 *
1694 * We don't bother with reference counting the mempolicy [mpol_get/put]
1695 * because the current task is examining its own mempolicy and a task's
1696 * mempolicy is only ever changed by the task itself.
1697 *
1698 * N.B., it is the caller's responsibility to free a returned nodemask.
1699 */
1700bool init_nodemask_of_mempolicy(nodemask_t *mask)
1701{
1702    struct mempolicy *mempolicy;
1703    int nid;
1704
1705    if (!(mask && current->mempolicy))
1706        return false;
1707
1708    task_lock(current);
1709    mempolicy = current->mempolicy;
1710    switch (mempolicy->mode) {
1711    case MPOL_PREFERRED:
1712        if (mempolicy->flags & MPOL_F_LOCAL)
1713            nid = numa_node_id();
1714        else
1715            nid = mempolicy->v.preferred_node;
1716        init_nodemask_of_node(mask, nid);
1717        break;
1718
1719    case MPOL_BIND:
1720        /* Fall through */
1721    case MPOL_INTERLEAVE:
1722        *mask = mempolicy->v.nodes;
1723        break;
1724
1725    default:
1726        BUG();
1727    }
1728    task_unlock(current);
1729
1730    return true;
1731}
1732#endif
1733
1734/*
1735 * mempolicy_nodemask_intersects
1736 *
1737 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1738 * policy. Otherwise, check for intersection between mask and the policy
1739 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1740 * policy, always return true since it may allocate elsewhere on fallback.
1741 *
1742 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1743 */
1744bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1745                    const nodemask_t *mask)
1746{
1747    struct mempolicy *mempolicy;
1748    bool ret = true;
1749
1750    if (!mask)
1751        return ret;
1752    task_lock(tsk);
1753    mempolicy = tsk->mempolicy;
1754    if (!mempolicy)
1755        goto out;
1756
1757    switch (mempolicy->mode) {
1758    case MPOL_PREFERRED:
1759        /*
1760         * MPOL_PREFERRED and MPOL_F_LOCAL are only a preference for which
1761         * nodes to allocate from; the task may fall back to other nodes when OOM.
1762         * Thus, it's possible for tsk to have allocated memory from
1763         * nodes in mask.
1764         */
1765        break;
1766    case MPOL_BIND:
1767    case MPOL_INTERLEAVE:
1768        ret = nodes_intersects(mempolicy->v.nodes, *mask);
1769        break;
1770    default:
1771        BUG();
1772    }
1773out:
1774    task_unlock(tsk);
1775    return ret;
1776}
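
/*
 * Example (sketch, modelled on the OOM killer's eligibility check): when an
 * allocation is constrained to a set of nodes, only tasks that may hold
 * memory on those nodes are worth killing.  constrained_nodes is an
 * illustrative name for that set.
 *
 *	if (!mempolicy_nodemask_intersects(task, constrained_nodes))
 *		continue;
 *
 * i.e. skip a task that cannot free memory on the nodes of interest.
 */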
1777
1778/* Allocate a page in interleaved policy.
1779   Own path because it needs to do special accounting. */
1780static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1781                    unsigned nid)
1782{
1783    struct zonelist *zl;
1784    struct page *page;
1785
1786    zl = node_zonelist(nid, gfp);
1787    page = __alloc_pages(gfp, order, zl);
1788    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1789        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1790    return page;
1791}
1792
1793/**
1794 * alloc_pages_vma - Allocate a page for a VMA.
1795 *
1796 * @gfp:
1797 * %GFP_USER user allocation.
1798 * %GFP_KERNEL kernel allocations,
1799 * %GFP_HIGHMEM highmem/user allocations,
1800 * %GFP_FS allocation should not call back into a file system.
1801 * %GFP_ATOMIC don't sleep.
1802 *
1803 * @order: Order of the GFP allocation.
1804 * @vma: Pointer to VMA or NULL if not available.
1805 * @addr: Virtual address of the allocation. Must be inside the VMA.
1806 * @node: Which node to prefer for allocation (modulo policy).
1807 *
1808 * This function allocates a page from the kernel page pool and applies
1809 * a NUMA policy associated with the VMA or the current process.
1810 * Should be used for all allocations of pages that will be mapped into
1811 * user space. Returns NULL when no page can be allocated.
1812 *
1813 * When @vma is not NULL, the caller must hold down_read on the mmap_sem
1814 * of the VMA's mm_struct to prevent it from going away.
1815 */
1816struct page *
1817alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818        unsigned long addr, int node)
1819{
1820    struct mempolicy *pol = get_vma_policy(current, vma, addr);
1821    struct zonelist *zl;
1822    struct page *page;
1823
1824    get_mems_allowed();
1825    if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1826        unsigned nid;
1827
1828        nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1829        mpol_cond_put(pol);
1830        page = alloc_page_interleave(gfp, order, nid);
1831        put_mems_allowed();
1832        return page;
1833    }
1834    zl = policy_zonelist(gfp, pol, node);
1835    if (unlikely(mpol_needs_cond_ref(pol))) {
1836        /*
1837         * slow path: ref counted shared policy
1838         */
1839        struct page *page = __alloc_pages_nodemask(gfp, order,
1840                        zl, policy_nodemask(gfp, pol));
1841        __mpol_put(pol);
1842        put_mems_allowed();
1843        return page;
1844    }
1845    /*
1846     * fast path: default or task policy
1847     */
1848    page = __alloc_pages_nodemask(gfp, order, zl,
1849                      policy_nodemask(gfp, pol));
1850    put_mems_allowed();
1851    return page;
1852}
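
/*
 * Example (sketch): most callers use the alloc_page_vma() wrapper from
 * <linux/gfp.h>, e.g. for an anonymous page fault:
 *
 *	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 *	if (!page)
 *		goto oom;
 *
 * which, on NUMA kernels, ends up here with order 0 and node set to
 * numa_node_id().  The explicit @node argument exists so that callers such
 * as transparent hugepage allocation can steer the non-interleaved case
 * toward a particular node.
 */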
1853
1854/**
1855 * alloc_pages_current - Allocate pages.
1856 *
1857 * @gfp:
1858 * %GFP_USER user allocation,
1859 * %GFP_KERNEL kernel allocation,
1860 * %GFP_HIGHMEM highmem allocation,
1861 * %GFP_FS don't call back into a file system.
1862 * %GFP_ATOMIC don't sleep.
1863 * @order: Power of two of allocation size in pages. 0 is a single page.
1864 *
1865 * Allocate a page from the kernel page pool. When not in
1866 * interrupt context, apply the current process' NUMA policy.
1867 * Returns NULL when no page can be allocated.
1868 *
1869 * Don't call cpuset_update_task_memory_state() unless
1870 * 1) it's ok to take cpuset_sem (can WAIT), and
1871 * 2) allocating for current task (not interrupt).
1872 */
1873struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1874{
1875    struct mempolicy *pol = current->mempolicy;
1876    struct page *page;
1877
1878    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1879        pol = &default_policy;
1880
1881    get_mems_allowed();
1882    /*
1883     * No reference counting needed for current->mempolicy
1884     * nor system default_policy
1885     */
1886    if (pol->mode == MPOL_INTERLEAVE)
1887        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1888    else
1889        page = __alloc_pages_nodemask(gfp, order,
1890                policy_zonelist(gfp, pol, numa_node_id()),
1891                policy_nodemask(gfp, pol));
1892    put_mems_allowed();
1893    return page;
1894}
1895EXPORT_SYMBOL(alloc_pages_current);
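
/*
 * Example (sketch): on NUMA kernels the generic alloc_pages() helper in
 * <linux/gfp.h> resolves to alloc_pages_current(), so an ordinary
 *
 *	page = alloc_pages(GFP_KERNEL, 0);
 *
 * already honours the calling task's mempolicy, except from interrupt
 * context or for __GFP_THISNODE allocations, which fall back to
 * default_policy as checked above.
 */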
1896
1897/*
1898 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1899 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1900 * with the mems_allowed returned by cpuset_mems_allowed(). This
1901 * keeps mempolicies cpuset relative after its cpuset moves. See
1902 * further kernel/cpuset.c update_nodemask().
1903 *
1904 * current's mempolicy may be rebound by another task (the task that changes
1905 * the cpuset's mems), so we needn't do rebind work for the current task.
1906 */
1907
1908/* Slow path of a mempolicy duplicate */
1909struct mempolicy *__mpol_dup(struct mempolicy *old)
1910{
1911    struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1912
1913    if (!new)
1914        return ERR_PTR(-ENOMEM);
1915
1916    /* task's mempolicy is protected by alloc_lock */
1917    if (old == current->mempolicy) {
1918        task_lock(current);
1919        *new = *old;
1920        task_unlock(current);
1921    } else
1922        *new = *old;
1923
1924    rcu_read_lock();
1925    if (current_cpuset_is_being_rebound()) {
1926        nodemask_t mems = cpuset_mems_allowed(current);
1927        if (new->flags & MPOL_F_REBINDING)
1928            mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
1929        else
1930            mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
1931    }
1932    rcu_read_unlock();
1933    atomic_set(&new->refcnt, 1);
1934    return new;
1935}
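
/*
 * Example (sketch): callers go through the mpol_dup() inline from
 * <linux/mempolicy.h>, which only takes this slow path for a non-NULL
 * policy, e.g. when a VMA is duplicated at fork time:
 *
 *	pol = mpol_dup(vma_policy(old_vma));
 *	if (IS_ERR(pol))
 *		goto fail_nomem_policy;
 *	vma_set_policy(new_vma, pol);
 */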
1936
1937/*
1938 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1939 * eliminate the MPOL_F_* flags that require conditional ref and
1940 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
1941 * after return. Use the returned value.
1942 *
1943 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1944 * policy lookup, even if the policy needs/has extra ref on lookup.
1945 * shmem_readahead needs this.
1946 */
1947struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1948                        struct mempolicy *frompol)
1949{
1950    if (!mpol_needs_cond_ref(frompol))
1951        return frompol;
1952
1953    *tompol = *frompol;
1954    tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
1955    __mpol_put(frompol);
1956    return tompol;
1957}
1958
1959/* Slow path of a mempolicy comparison */
1960int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1961{
1962    if (!a || !b)
1963        return 0;
1964    if (a->mode != b->mode)
1965        return 0;
1966    if (a->flags != b->flags)
1967        return 0;
1968    if (mpol_store_user_nodemask(a))
1969        if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1970            return 0;
1971
1972    switch (a->mode) {
1973    case MPOL_BIND:
1974        /* Fall through */
1975    case MPOL_INTERLEAVE:
1976        return nodes_equal(a->v.nodes, b->v.nodes);
1977    case MPOL_PREFERRED:
1978        return a->v.preferred_node == b->v.preferred_node;
1979    default:
1980        BUG();
1981        return 0;
1982    }
1983}
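
/*
 * Example (sketch): the mpol_equal() inline from <linux/mempolicy.h>
 * short-circuits the a == b case before calling here; it is used e.g. when
 * deciding whether two adjacent VMAs may be merged:
 *
 *	if (mpol_equal(vma_policy(prev), vma_policy(next)))
 *		... the mempolicies do not prevent merging ...
 */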
1984
1985/*
1986 * Shared memory backing store policy support.
1987 *
1988 * Remember policies even when nobody has shared memory mapped.
1989 * The policies are kept in Red-Black tree linked from the inode.
1990 * They are protected by the sp->lock spinlock, which should be held
1991 * for any accesses to the tree.
1992 */
1993
1994/* lookup first element intersecting start-end */
1995/* Caller holds sp->lock */
1996static struct sp_node *
1997sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1998{
1999    struct rb_node *n = sp->root.rb_node;
2000
2001    while (n) {
2002        struct sp_node *p = rb_entry(n, struct sp_node, nd);
2003
2004        if (start >= p->end)
2005            n = n->rb_right;
2006        else if (end <= p->start)
2007            n = n->rb_left;
2008        else
2009            break;
2010    }
2011    if (!n)
2012        return NULL;
2013    for (;;) {
2014        struct sp_node *w = NULL;
2015        struct rb_node *prev = rb_prev(n);
2016        if (!prev)
2017            break;
2018        w = rb_entry(prev, struct sp_node, nd);
2019        if (w->end <= start)
2020            break;
2021        n = prev;
2022    }
2023    return rb_entry(n, struct sp_node, nd);
2024}
2025
2026/* Insert a new shared policy into the list. */
2027/* Caller holds sp->lock */
2028static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2029{
2030    struct rb_node **p = &sp->root.rb_node;
2031    struct rb_node *parent = NULL;
2032    struct sp_node *nd;
2033
2034    while (*p) {
2035        parent = *p;
2036        nd = rb_entry(parent, struct sp_node, nd);
2037        if (new->start < nd->start)
2038            p = &(*p)->rb_left;
2039        else if (new->end > nd->end)
2040            p = &(*p)->rb_right;
2041        else
2042            BUG();
2043    }
2044    rb_link_node(&new->nd, parent, p);
2045    rb_insert_color(&new->nd, &sp->root);
2046    pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2047         new->policy ? new->policy->mode : 0);
2048}
2049
2050/* Find shared policy intersecting idx */
2051struct mempolicy *
2052mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2053{
2054    struct mempolicy *pol = NULL;
2055    struct sp_node *sn;
2056
2057    if (!sp->root.rb_node)
2058        return NULL;
2059    spin_lock(&sp->lock);
2060    sn = sp_lookup(sp, idx, idx+1);
2061    if (sn) {
2062        mpol_get(sn->policy);
2063        pol = sn->policy;
2064    }
2065    spin_unlock(&sp->lock);
2066    return pol;
2067}
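
/*
 * Example (sketch, modelled on shmem's get_policy vm operation): translate
 * a faulting address into a file index and look up the policy stored for
 * that range:
 *
 *	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 *	pol = mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, idx);
 *
 * The returned policy, if any, carries the reference taken above, which the
 * caller must drop again once the allocation is done.
 */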
2068
2069static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2070{
2071    pr_debug("deleting %lx-%lx\n", n->start, n->end);
2072    rb_erase(&n->nd, &sp->root);
2073    mpol_put(n->policy);
2074    kmem_cache_free(sn_cache, n);
2075}
2076
2077static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2078                struct mempolicy *pol)
2079{
2080    struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2081
2082    if (!n)
2083        return NULL;
2084    n->start = start;
2085    n->end = end;
2086    mpol_get(pol);
2087    pol->flags |= MPOL_F_SHARED; /* for unref */
2088    n->policy = pol;
2089    return n;
2090}
2091
2092/* Replace a policy range. */
2093static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2094                 unsigned long end, struct sp_node *new)
2095{
2096    struct sp_node *n, *new2 = NULL;
2097
2098restart:
2099    spin_lock(&sp->lock);
2100    n = sp_lookup(sp, start, end);
2101    /* Take care of old policies in the same range. */
2102    while (n && n->start < end) {
2103        struct rb_node *next = rb_next(&n->nd);
2104        if (n->start >= start) {
2105            if (n->end <= end)
2106                sp_delete(sp, n);
2107            else
2108                n->start = end;
2109        } else {
2110            /* Old policy spanning whole new range. */
2111            if (n->end > end) {
2112                if (!new2) {
2113                    spin_unlock(&sp->lock);
2114                    new2 = sp_alloc(end, n->end, n->policy);
2115                    if (!new2)
2116                        return -ENOMEM;
2117                    goto restart;
2118                }
2119                n->end = start;
2120                sp_insert(sp, new2);
2121                new2 = NULL;
2122                break;
2123            } else
2124                n->end = start;
2125        }
2126        if (!next)
2127            break;
2128        n = rb_entry(next, struct sp_node, nd);
2129    }
2130    if (new)
2131        sp_insert(sp, new);
2132    spin_unlock(&sp->lock);
2133    if (new2) {
2134        mpol_put(new2->policy);
2135        kmem_cache_free(sn_cache, new2);
2136    }
2137    return 0;
2138}
2139
2140/**
2141 * mpol_shared_policy_init - initialize shared policy for inode
2142 * @sp: pointer to inode shared policy
2143 * @mpol: struct mempolicy to install
2144 *
2145 * Install non-NULL @mpol in inode's shared policy rb-tree.
2146 * On entry, the current task has a reference on a non-NULL @mpol.
2147 * This must be released on exit.
2148 * This is called at get_inode() time, so we can use GFP_KERNEL.
2149 */
2150void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2151{
2152    int ret;
2153
2154    sp->root = RB_ROOT; /* empty tree == default mempolicy */
2155    spin_lock_init(&sp->lock);
2156
2157    if (mpol) {
2158        struct vm_area_struct pvma;
2159        struct mempolicy *new;
2160        NODEMASK_SCRATCH(scratch);
2161
2162        if (!scratch)
2163            goto put_mpol;
2164        /* contextualize the tmpfs mount point mempolicy */
2165        new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2166        if (IS_ERR(new))
2167            goto free_scratch; /* no valid nodemask intersection */
2168
2169        task_lock(current);
2170        ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2171        task_unlock(current);
2172        if (ret)
2173            goto put_new;
2174
2175        /* Create pseudo-vma that contains just the policy */
2176        memset(&pvma, 0, sizeof(struct vm_area_struct));
2177        pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2178        mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2179
2180put_new:
2181        mpol_put(new); /* drop initial ref */
2182free_scratch:
2183        NODEMASK_SCRATCH_FREE(scratch);
2184put_mpol:
2185        mpol_put(mpol); /* drop our incoming ref on sb mpol */
2186    }
2187}
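
/*
 * Example (sketch, modelled on shmem inode creation): install the tmpfs
 * mount's mempolicy, if any, as the new inode's shared policy; the
 * reference obtained from the superblock is consumed here:
 *
 *	mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo));
 */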
2188
2189int mpol_set_shared_policy(struct shared_policy *info,
2190            struct vm_area_struct *vma, struct mempolicy *npol)
2191{
2192    int err;
2193    struct sp_node *new = NULL;
2194    unsigned long sz = vma_pages(vma);
2195
2196    pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2197         vma->vm_pgoff,
2198         sz, npol ? npol->mode : -1,
2199         npol ? npol->flags : -1,
2200         npol ? nodes_addr(npol->v.nodes)[0] : -1);
2201
2202    if (npol) {
2203        new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2204        if (!new)
2205            return -ENOMEM;
2206    }
2207    err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2208    if (err && new)
2209        kmem_cache_free(sn_cache, new);
2210    return err;
2211}
2212
2213/* Free a backing policy store on inode delete. */
2214void mpol_free_shared_policy(struct shared_policy *p)
2215{
2216    struct sp_node *n;
2217    struct rb_node *next;
2218
2219    if (!p->root.rb_node)
2220        return;
2221    spin_lock(&p->lock);
2222    next = rb_first(&p->root);
2223    while (next) {
2224        n = rb_entry(next, struct sp_node, nd);
2225        next = rb_next(&n->nd);
2226        rb_erase(&n->nd, &p->root);
2227        mpol_put(n->policy);
2228        kmem_cache_free(sn_cache, n);
2229    }
2230    spin_unlock(&p->lock);
2231}
2232
2233/* assumes fs == KERNEL_DS */
2234void __init numa_policy_init(void)
2235{
2236    nodemask_t interleave_nodes;
2237    unsigned long largest = 0;
2238    int nid, prefer = 0;
2239
2240    policy_cache = kmem_cache_create("numa_policy",
2241                     sizeof(struct mempolicy),
2242                     0, SLAB_PANIC, NULL);
2243
2244    sn_cache = kmem_cache_create("shared_policy_node",
2245                     sizeof(struct sp_node),
2246                     0, SLAB_PANIC, NULL);
2247
2248    /*
2249     * Set interleaving policy for system init. Interleaving is only
2250     * enabled across suitably sized nodes (default is >= 16MB), or
2251     * fall back to the largest node if they're all smaller.
2252     */
2253    nodes_clear(interleave_nodes);
2254    for_each_node_state(nid, N_HIGH_MEMORY) {
2255        unsigned long total_pages = node_present_pages(nid);
2256
2257        /* Preserve the largest node */
2258        if (largest < total_pages) {
2259            largest = total_pages;
2260            prefer = nid;
2261        }
2262
2263        /* Interleave this node? */
2264        if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2265            node_set(nid, interleave_nodes);
2266    }
2267
2268    /* All too small, use the largest */
2269    if (unlikely(nodes_empty(interleave_nodes)))
2270        node_set(prefer, interleave_nodes);
2271
2272    if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2273        printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2274}
2275
2276/* Reset policy of current process to default */
2277void numa_default_policy(void)
2278{
2279    do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2280}
2281
2282/*
2283 * Parse and format mempolicy from/to strings
2284 */
2285
2286/*
2287 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
2288 * Used only for mpol_parse_str() and mpol_to_str()
2289 */
2290#define MPOL_LOCAL MPOL_MAX
2291static const char * const policy_modes[] =
2292{
2293    [MPOL_DEFAULT] = "default",
2294    [MPOL_PREFERRED] = "prefer",
2295    [MPOL_BIND] = "bind",
2296    [MPOL_INTERLEAVE] = "interleave",
2297    [MPOL_LOCAL] = "local"
2298};
2299
2300
2301#ifdef CONFIG_TMPFS
2302/**
2303 * mpol_parse_str - parse string to mempolicy
2304 * @str: string containing mempolicy to parse
2305 * @mpol: pointer to struct mempolicy pointer, returned on success.
2306 * @no_context: flag whether to "contextualize" the mempolicy
2307 *
2308 * Format of input:
2309 * <mode>[=<flags>][:<nodelist>]
2310 *
2311 * if @no_context is true, save the input nodemask in w.user_nodemask in
2312 * the returned mempolicy. This will be used to "clone" the mempolicy in
2313 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
2314 * mount option. Note that if 'static' or 'relative' mode flags were
2315 * specified, the input nodemask will already have been saved. Saving
2316 * it again is redundant, but safe.
2317 *
2318 * On success, returns 0, else 1
2319 */
2320int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2321{
2322    struct mempolicy *new = NULL;
2323    unsigned short mode;
2324    unsigned short uninitialized_var(mode_flags);
2325    nodemask_t nodes;
2326    char *nodelist = strchr(str, ':');
2327    char *flags = strchr(str, '=');
2328    int err = 1;
2329
2330    if (nodelist) {
2331        /* NUL-terminate mode or flags string */
2332        *nodelist++ = '\0';
2333        if (nodelist_parse(nodelist, nodes))
2334            goto out;
2335        if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2336            goto out;
2337    } else
2338        nodes_clear(nodes);
2339
2340    if (flags)
2341        *flags++ = '\0'; /* terminate mode string */
2342
2343    for (mode = 0; mode <= MPOL_LOCAL; mode++) {
2344        if (!strcmp(str, policy_modes[mode])) {
2345            break;
2346        }
2347    }
2348    if (mode > MPOL_LOCAL)
2349        goto out;
2350
2351    switch (mode) {
2352    case MPOL_PREFERRED:
2353        /*
2354         * Insist on a nodelist of one node only
2355         */
2356        if (nodelist) {
2357            char *rest = nodelist;
2358            while (isdigit(*rest))
2359                rest++;
2360            if (*rest)
2361                goto out;
2362        }
2363        break;
2364    case MPOL_INTERLEAVE:
2365        /*
2366         * Default to online nodes with memory if no nodelist
2367         */
2368        if (!nodelist)
2369            nodes = node_states[N_HIGH_MEMORY];
2370        break;
2371    case MPOL_LOCAL:
2372        /*
2373         * Don't allow a nodelist; mpol_new() checks flags
2374         */
2375        if (nodelist)
2376            goto out;
2377        mode = MPOL_PREFERRED;
2378        break;
2379    case MPOL_DEFAULT:
2380        /*
2381         * Insist on an empty nodelist
2382         */
2383        if (!nodelist)
2384            err = 0;
2385        goto out;
2386    case MPOL_BIND:
2387        /*
2388         * Insist on a nodelist
2389         */
2390        if (!nodelist)
2391            goto out;
2392    }
2393
2394    mode_flags = 0;
2395    if (flags) {
2396        /*
2397         * Currently, we only support two mutually exclusive
2398         * mode flags.
2399         */
2400        if (!strcmp(flags, "static"))
2401            mode_flags |= MPOL_F_STATIC_NODES;
2402        else if (!strcmp(flags, "relative"))
2403            mode_flags |= MPOL_F_RELATIVE_NODES;
2404        else
2405            goto out;
2406    }
2407
2408    new = mpol_new(mode, mode_flags, &nodes);
2409    if (IS_ERR(new))
2410        goto out;
2411
2412    if (no_context) {
2413        /* save for contextualization */
2414        new->w.user_nodemask = nodes;
2415    } else {
2416        int ret;
2417        NODEMASK_SCRATCH(scratch);
2418        if (scratch) {
2419            task_lock(current);
2420            ret = mpol_set_nodemask(new, &nodes, scratch);
2421            task_unlock(current);
2422        } else
2423            ret = -ENOMEM;
2424        NODEMASK_SCRATCH_FREE(scratch);
2425        if (ret) {
2426            mpol_put(new);
2427            goto out;
2428        }
2429    }
2430    err = 0;
2431
2432out:
2433    /* Restore string for error message */
2434    if (nodelist)
2435        *--nodelist = ':';
2436    if (flags)
2437        *--flags = '=';
2438    if (!err)
2439        *mpol = new;
2440    return err;
2441}
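
/*
 * Example (sketch): parsing the tmpfs mount option "mpol=interleave:0-3"
 * hands the (writable) text after "mpol=" to mpol_parse_str():
 *
 *	char str[] = "interleave:0-3";	(writable: parsing splits it in place)
 *	struct mempolicy *mpol;
 *
 *	if (mpol_parse_str(str, &mpol, 1))
 *		... reject the mount option ...
 *
 * On success this yields an MPOL_INTERLEAVE policy whose w.user_nodemask
 * holds nodes 0-3, saved for later contextualization because @no_context
 * was non-zero.  "prefer=static:1" and plain "local" parse analogously.
 */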
2442#endif /* CONFIG_TMPFS */
2443
2444/**
2445 * mpol_to_str - format a mempolicy structure for printing
2446 * @buffer: to contain formatted mempolicy string
2447 * @maxlen: length of @buffer
2448 * @pol: pointer to mempolicy to be formatted
2449 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
2450 *
2451 * Convert a mempolicy into a string.
2452 * Returns the number of characters in buffer (if positive)
2453 * or an error (negative)
2454 */
2455int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2456{
2457    char *p = buffer;
2458    int l;
2459    nodemask_t nodes;
2460    unsigned short mode;
2461    unsigned short flags = pol ? pol->flags : 0;
2462
2463    /*
2464     * Sanity check: room for longest mode, flag and some nodes
2465     */
2466    VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2467
2468    if (!pol || pol == &default_policy)
2469        mode = MPOL_DEFAULT;
2470    else
2471        mode = pol->mode;
2472
2473    switch (mode) {
2474    case MPOL_DEFAULT:
2475        nodes_clear(nodes);
2476        break;
2477
2478    case MPOL_PREFERRED:
2479        nodes_clear(nodes);
2480        if (flags & MPOL_F_LOCAL)
2481            mode = MPOL_LOCAL; /* pseudo-policy */
2482        else
2483            node_set(pol->v.preferred_node, nodes);
2484        break;
2485
2486    case MPOL_BIND:
2487        /* Fall through */
2488    case MPOL_INTERLEAVE:
2489        if (no_context)
2490            nodes = pol->w.user_nodemask;
2491        else
2492            nodes = pol->v.nodes;
2493        break;
2494
2495    default:
2496        BUG();
2497    }
2498
2499    l = strlen(policy_modes[mode]);
2500    if (buffer + maxlen < p + l + 1)
2501        return -ENOSPC;
2502
2503    strcpy(p, policy_modes[mode]);
2504    p += l;
2505
2506    if (flags & MPOL_MODE_FLAGS) {
2507        if (buffer + maxlen < p + 2)
2508            return -ENOSPC;
2509        *p++ = '=';
2510
2511        /*
2512         * Currently, the only defined flags are mutually exclusive
2513         */
2514        if (flags & MPOL_F_STATIC_NODES)
2515            p += snprintf(p, buffer + maxlen - p, "static");
2516        else if (flags & MPOL_F_RELATIVE_NODES)
2517            p += snprintf(p, buffer + maxlen - p, "relative");
2518    }
2519
2520    if (!nodes_empty(nodes)) {
2521        if (buffer + maxlen < p + 2)
2522            return -ENOSPC;
2523        *p++ = ':';
2524        p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2525    }
2526    return p - buffer;
2527}
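
/*
 * Example (sketch): formatting a policy for numa_maps style output:
 *
 *	char buf[64];
 *
 *	if (mpol_to_str(buf, sizeof(buf), pol, 0) >= 0)
 *		seq_printf(m, "%s", buf);
 *
 * produces strings such as "interleave=static:0-3", "prefer:1", "bind:0,2"
 * or "default", i.e. the same syntax accepted by mpol_parse_str() above.
 */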
2528
