mm/mempolicy.c

1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
 19 * for anonymous memory. For process policy a per-process counter
 20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
 24 * FIXME: memory is allocated starting with the first node
 25 * and proceeding to the last. It would be better if bind truly
 26 * restricted the allocation to the allowed memory nodes.
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
 38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
 49 * on systems with highmem, kernel lowmem allocations don't get policied.
 50 * The same applies to GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
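/*
 * Illustrative userspace sketch, not part of this file: roughly how the
 * policies described above are requested via the set_mempolicy(2) and
 * mbind(2) system calls implemented further down. Error handling is omitted
 * and the node numbers are made up; real programs usually go through libnuma
 * rather than raw syscalls.
 *
 *    unsigned long mask = (1UL << 0) | (1UL << 1);      nodes 0 and 1
 *    unsigned long maxnode = 8 * sizeof(mask) + 1;      get_nodes() reads maxnode - 1 bits
 *
 *    set_mempolicy(MPOL_INTERLEAVE, &mask, maxnode);    process policy
 *
 *    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *    mbind(buf, len, MPOL_BIND, &mask, maxnode, 0);     VMA policy
 *
 *    set_mempolicy(MPOL_DEFAULT, NULL, 0);              back to default/local
 */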
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
 64   make bind policy root only? It can trigger oom much faster and the
 65   kernel does not always handle that gracefully.
66*/
67
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70#include <linux/mempolicy.h>
71#include <linux/mm.h>
72#include <linux/highmem.h>
73#include <linux/hugetlb.h>
74#include <linux/kernel.h>
75#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/slab.h>
79#include <linux/string.h>
80#include <linux/export.h>
81#include <linux/nsproxy.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/swap.h>
86#include <linux/seq_file.h>
87#include <linux/proc_fs.h>
88#include <linux/migrate.h>
89#include <linux/ksm.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
92#include <linux/syscalls.h>
93#include <linux/ctype.h>
94#include <linux/mm_inline.h>
95#include <linux/mmu_notifier.h>
96#include <linux/printk.h>
97
98#include <asm/tlbflush.h>
99#include <asm/uaccess.h>
100#include <linux/random.h>
101
102#include "internal.h"
103
104/* Internal flags */
105#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
106#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
107
108static struct kmem_cache *policy_cache;
109static struct kmem_cache *sn_cache;
110
 111/* Highest zone. A specific allocation for a zone below that is not
112   policied. */
113enum zone_type policy_zone = 0;
114
115/*
116 * run-time system-wide default policy => local allocation
117 */
118static struct mempolicy default_policy = {
119    .refcnt = ATOMIC_INIT(1), /* never free it */
120    .mode = MPOL_PREFERRED,
121    .flags = MPOL_F_LOCAL,
122};
123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125
126static struct mempolicy *get_task_policy(struct task_struct *p)
127{
128    struct mempolicy *pol = p->mempolicy;
129
130    if (!pol) {
131        int node = numa_node_id();
132
133        if (node != NUMA_NO_NODE) {
134            pol = &preferred_node_policy[node];
135            /*
136             * preferred_node_policy is not initialised early in
137             * boot
138             */
139            if (!pol->mode)
140                pol = NULL;
141        }
142    }
143
144    return pol;
145}
146
147static const struct mempolicy_operations {
148    int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
149    /*
 150     * If the read-side task has no lock to protect task->mempolicy, the
 151     * write-side task rebinds task->mempolicy in two steps. The first
 152     * step sets all the new nodes, and the second step clears all the
 153     * disallowed nodes. That way we avoid a window in which no node is
 154     * left to allocate a page from.
 155     * If we do have a lock to protect task->mempolicy on the read side,
 156     * we rebind directly.
 157     *
 158     * step:
 159     * MPOL_REBIND_ONCE - do the rebind work at once
 160     * MPOL_REBIND_STEP1 - set all the new nodes
 161     * MPOL_REBIND_STEP2 - clear all the disallowed nodes
162     */
163    void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
164            enum mpol_rebind_step step);
165} mpol_ops[MPOL_MAX];
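/*
 * Worked example of the two-step rebind described above, with made-up
 * nodemasks: an MPOL_BIND policy currently using nodes {0,1} while the
 * cpuset's mems_allowed changes to {2,3}.
 *
 *    MPOL_REBIND_STEP1: v.nodes |= old nodes remapped onto {2,3} -> {0,1,2,3}
 *    MPOL_REBIND_STEP2: v.nodes  = the remapped set cached by STEP1 -> {2,3}
 *
 * Between the two steps a lockless reader always sees a non-empty mask, so
 * it never finds itself with no node to allocate from. MPOL_REBIND_ONCE does
 * the same remap in a single assignment when the reader is known to hold a
 * lock.
 */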
166
167/* Check that the nodemask contains at least one populated zone */
168static int is_valid_nodemask(const nodemask_t *nodemask)
169{
170    return nodes_intersects(*nodemask, node_states[N_MEMORY]);
171}
172
173static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
174{
175    return pol->flags & MPOL_MODE_FLAGS;
176}
177
178static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
179                   const nodemask_t *rel)
180{
181    nodemask_t tmp;
182    nodes_fold(tmp, *orig, nodes_weight(*rel));
183    nodes_onto(*ret, tmp, *rel);
184}
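/*
 * Worked example with illustrative numbers: for orig = {1,3} and
 * rel = {4,5,6,7}, nodes_fold() folds orig modulo nodes_weight(rel) = 4,
 * giving {1,3}, and nodes_onto() maps those bit positions onto the set bits
 * of rel, giving ret = {5,7}. A relative nodemask therefore selects the
 * 2nd and 4th of whatever nodes are currently allowed.
 */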
185
186static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
187{
188    if (nodes_empty(*nodes))
189        return -EINVAL;
190    pol->v.nodes = *nodes;
191    return 0;
192}
193
194static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
195{
196    if (!nodes)
197        pol->flags |= MPOL_F_LOCAL; /* local allocation */
198    else if (nodes_empty(*nodes))
199        return -EINVAL; /* no allowed nodes */
200    else
201        pol->v.preferred_node = first_node(*nodes);
202    return 0;
203}
204
205static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
206{
207    if (!is_valid_nodemask(nodes))
208        return -EINVAL;
209    pol->v.nodes = *nodes;
210    return 0;
211}
212
213/*
214 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
215 * any, for the new policy. mpol_new() has already validated the nodes
216 * parameter with respect to the policy mode and flags. But, we need to
217 * handle an empty nodemask with MPOL_PREFERRED here.
218 *
219 * Must be called holding task's alloc_lock to protect task's mems_allowed
220 * and mempolicy. May also be called holding the mmap_semaphore for write.
221 */
222static int mpol_set_nodemask(struct mempolicy *pol,
223             const nodemask_t *nodes, struct nodemask_scratch *nsc)
224{
225    int ret;
226
227    /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
228    if (pol == NULL)
229        return 0;
230    /* Check N_MEMORY */
231    nodes_and(nsc->mask1,
232          cpuset_current_mems_allowed, node_states[N_MEMORY]);
233
234    VM_BUG_ON(!nodes);
235    if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
236        nodes = NULL; /* explicit local allocation */
237    else {
238        if (pol->flags & MPOL_F_RELATIVE_NODES)
239            mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
240        else
241            nodes_and(nsc->mask2, *nodes, nsc->mask1);
242
243        if (mpol_store_user_nodemask(pol))
244            pol->w.user_nodemask = *nodes;
245        else
246            pol->w.cpuset_mems_allowed =
247                        cpuset_current_mems_allowed;
248    }
249
250    if (nodes)
251        ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
252    else
253        ret = mpol_ops[pol->mode].create(pol, NULL);
254    return ret;
255}
256
257/*
 258 * This function just creates a new policy, does some checks and simple
 259 * initialization. You must invoke mpol_set_nodemask() to set the nodes.
260 */
261static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
262                  nodemask_t *nodes)
263{
264    struct mempolicy *policy;
265
266    pr_debug("setting mode %d flags %d nodes[0] %lx\n",
267         mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
268
269    if (mode == MPOL_DEFAULT) {
270        if (nodes && !nodes_empty(*nodes))
271            return ERR_PTR(-EINVAL);
272        return NULL;
273    }
274    VM_BUG_ON(!nodes);
275
276    /*
277     * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
278     * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
279     * All other modes require a valid pointer to a non-empty nodemask.
280     */
281    if (mode == MPOL_PREFERRED) {
282        if (nodes_empty(*nodes)) {
283            if (((flags & MPOL_F_STATIC_NODES) ||
284                 (flags & MPOL_F_RELATIVE_NODES)))
285                return ERR_PTR(-EINVAL);
286        }
287    } else if (mode == MPOL_LOCAL) {
288        if (!nodes_empty(*nodes))
289            return ERR_PTR(-EINVAL);
290        mode = MPOL_PREFERRED;
291    } else if (nodes_empty(*nodes))
292        return ERR_PTR(-EINVAL);
293    policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
294    if (!policy)
295        return ERR_PTR(-ENOMEM);
296    atomic_set(&policy->refcnt, 1);
297    policy->mode = mode;
298    policy->flags = flags;
299
300    return policy;
301}
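/*
 * Hedged sketch, for illustration only and not called anywhere: how callers
 * later in this file (e.g. do_set_mempolicy() and do_mbind()) pair mpol_new()
 * with mpol_set_nodemask(). The function name is made up.
 */
static __maybe_unused struct mempolicy *example_build_mempolicy(
        unsigned short mode, unsigned short flags, nodemask_t *nodes)
{
    struct mempolicy *new;
    int err;
    NODEMASK_SCRATCH(scratch);

    if (!scratch)
        return ERR_PTR(-ENOMEM);

    new = mpol_new(mode, flags, nodes);     /* validates mode/flags/nodes */
    if (!IS_ERR(new)) {
        task_lock(current);                 /* protects mems_allowed */
        err = mpol_set_nodemask(new, nodes, scratch);
        task_unlock(current);
        if (err) {
            mpol_put(new);
            new = ERR_PTR(err);
        }
    }
    NODEMASK_SCRATCH_FREE(scratch);
    return new;
}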
302
303/* Slow path of a mpol destructor. */
304void __mpol_put(struct mempolicy *p)
305{
306    if (!atomic_dec_and_test(&p->refcnt))
307        return;
308    kmem_cache_free(policy_cache, p);
309}
310
311static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
312                enum mpol_rebind_step step)
313{
314}
315
316/*
317 * step:
 318 * MPOL_REBIND_ONCE - do the rebind work at once
 319 * MPOL_REBIND_STEP1 - set all the new nodes
 320 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
321 */
322static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
323                 enum mpol_rebind_step step)
324{
325    nodemask_t tmp;
326
327    if (pol->flags & MPOL_F_STATIC_NODES)
328        nodes_and(tmp, pol->w.user_nodemask, *nodes);
329    else if (pol->flags & MPOL_F_RELATIVE_NODES)
330        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
331    else {
332        /*
 333         * If step is MPOL_REBIND_STEP1, cache the intermediate result
 334         * in ->w.cpuset_mems_allowed for use by MPOL_REBIND_STEP2.
335         */
336        if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
337            nodes_remap(tmp, pol->v.nodes,
338                    pol->w.cpuset_mems_allowed, *nodes);
339            pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
340        } else if (step == MPOL_REBIND_STEP2) {
341            tmp = pol->w.cpuset_mems_allowed;
342            pol->w.cpuset_mems_allowed = *nodes;
343        } else
344            BUG();
345    }
346
347    if (nodes_empty(tmp))
348        tmp = *nodes;
349
350    if (step == MPOL_REBIND_STEP1)
351        nodes_or(pol->v.nodes, pol->v.nodes, tmp);
352    else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
353        pol->v.nodes = tmp;
354    else
355        BUG();
356
357    if (!node_isset(current->il_next, tmp)) {
358        current->il_next = next_node(current->il_next, tmp);
359        if (current->il_next >= MAX_NUMNODES)
360            current->il_next = first_node(tmp);
361        if (current->il_next >= MAX_NUMNODES)
362            current->il_next = numa_node_id();
363    }
364}
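/*
 * Worked example for the three cases above, with made-up nodemasks: a policy
 * whose user_nodemask was {0,1} when the allowed nodes change to {2,3}:
 *
 *    MPOL_F_STATIC_NODES:    {0,1} & {2,3} = {}, so fall back to {2,3}
 *    MPOL_F_RELATIVE_NODES:  positions 0,1 of {2,3}        -> {2,3}
 *    neither flag:           nodes_remap() maps 0->2, 1->3 -> {2,3}
 *
 * The empty-intersection fallback is what the nodes_empty(tmp) check above
 * implements.
 */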
365
366static void mpol_rebind_preferred(struct mempolicy *pol,
367                  const nodemask_t *nodes,
368                  enum mpol_rebind_step step)
369{
370    nodemask_t tmp;
371
372    if (pol->flags & MPOL_F_STATIC_NODES) {
373        int node = first_node(pol->w.user_nodemask);
374
375        if (node_isset(node, *nodes)) {
376            pol->v.preferred_node = node;
377            pol->flags &= ~MPOL_F_LOCAL;
378        } else
379            pol->flags |= MPOL_F_LOCAL;
380    } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
381        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
382        pol->v.preferred_node = first_node(tmp);
383    } else if (!(pol->flags & MPOL_F_LOCAL)) {
384        pol->v.preferred_node = node_remap(pol->v.preferred_node,
385                           pol->w.cpuset_mems_allowed,
386                           *nodes);
387        pol->w.cpuset_mems_allowed = *nodes;
388    }
389}
390
391/*
392 * mpol_rebind_policy - Migrate a policy to a different set of nodes
393 *
 394 * If the read-side task has no lock to protect task->mempolicy, the
 395 * write-side task rebinds task->mempolicy in two steps. The first step
 396 * sets all the new nodes, and the second step clears all the disallowed
 397 * nodes. That way we avoid a window in which no node is left to
 398 * allocate a page from.
 399 * If we do have a lock to protect task->mempolicy on the read side, we
 400 * rebind directly.
 401 *
 402 * step:
 403 * MPOL_REBIND_ONCE - do the rebind work at once
 404 * MPOL_REBIND_STEP1 - set all the new nodes
 405 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
406 */
407static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
408                enum mpol_rebind_step step)
409{
410    if (!pol)
411        return;
412    if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
413        nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
414        return;
415
416    if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
417        return;
418
419    if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
420        BUG();
421
422    if (step == MPOL_REBIND_STEP1)
423        pol->flags |= MPOL_F_REBINDING;
424    else if (step == MPOL_REBIND_STEP2)
425        pol->flags &= ~MPOL_F_REBINDING;
426    else if (step >= MPOL_REBIND_NSTEP)
427        BUG();
428
429    mpol_ops[pol->mode].rebind(pol, newmask, step);
430}
431
432/*
433 * Wrapper for mpol_rebind_policy() that just requires task
434 * pointer, and updates task mempolicy.
435 *
436 * Called with task's alloc_lock held.
437 */
438
439void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
440            enum mpol_rebind_step step)
441{
442    mpol_rebind_policy(tsk->mempolicy, new, step);
443}
444
445/*
446 * Rebind each vma in mm to new nodemask.
447 *
448 * Call holding a reference to mm. Takes mm->mmap_sem during call.
449 */
450
451void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
452{
453    struct vm_area_struct *vma;
454
455    down_write(&mm->mmap_sem);
456    for (vma = mm->mmap; vma; vma = vma->vm_next)
457        mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
458    up_write(&mm->mmap_sem);
459}
460
461static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
462    [MPOL_DEFAULT] = {
463        .rebind = mpol_rebind_default,
464    },
465    [MPOL_INTERLEAVE] = {
466        .create = mpol_new_interleave,
467        .rebind = mpol_rebind_nodemask,
468    },
469    [MPOL_PREFERRED] = {
470        .create = mpol_new_preferred,
471        .rebind = mpol_rebind_preferred,
472    },
473    [MPOL_BIND] = {
474        .create = mpol_new_bind,
475        .rebind = mpol_rebind_nodemask,
476    },
477};
478
479static void migrate_page_add(struct page *page, struct list_head *pagelist,
480                unsigned long flags);
481
482/*
 483 * Scan through the pages, checking whether they satisfy the given
 484 * conditions, and move them to the pagelist if they do.
485 */
486static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
487        unsigned long addr, unsigned long end,
488        const nodemask_t *nodes, unsigned long flags,
489        void *private)
490{
491    pte_t *orig_pte;
492    pte_t *pte;
493    spinlock_t *ptl;
494
495    orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
496    do {
497        struct page *page;
498        int nid;
499
500        if (!pte_present(*pte))
501            continue;
502        page = vm_normal_page(vma, addr, *pte);
503        if (!page)
504            continue;
505        /*
506         * vm_normal_page() filters out zero pages, but there might
507         * still be PageReserved pages to skip, perhaps in a VDSO.
508         */
509        if (PageReserved(page))
510            continue;
511        nid = page_to_nid(page);
512        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
513            continue;
514
515        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
516            migrate_page_add(page, private, flags);
517        else
518            break;
519    } while (pte++, addr += PAGE_SIZE, addr != end);
520    pte_unmap_unlock(orig_pte, ptl);
521    return addr != end;
522}
523
524static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
525        pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
526                    void *private)
527{
528#ifdef CONFIG_HUGETLB_PAGE
529    int nid;
530    struct page *page;
531    spinlock_t *ptl;
532    pte_t entry;
533
534    ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
535    entry = huge_ptep_get((pte_t *)pmd);
536    if (!pte_present(entry))
537        goto unlock;
538    page = pte_page(entry);
539    nid = page_to_nid(page);
540    if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
541        goto unlock;
542    /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
543    if (flags & (MPOL_MF_MOVE_ALL) ||
544        (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
545        isolate_huge_page(page, private);
546unlock:
547    spin_unlock(ptl);
548#else
549    BUG();
550#endif
551}
552
553static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
554        unsigned long addr, unsigned long end,
555        const nodemask_t *nodes, unsigned long flags,
556        void *private)
557{
558    pmd_t *pmd;
559    unsigned long next;
560
561    pmd = pmd_offset(pud, addr);
562    do {
563        next = pmd_addr_end(addr, end);
564        if (!pmd_present(*pmd))
565            continue;
566        if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
567            queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
568                        flags, private);
569            continue;
570        }
571        split_huge_page_pmd(vma, addr, pmd);
572        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
573            continue;
574        if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
575                    flags, private))
576            return -EIO;
577    } while (pmd++, addr = next, addr != end);
578    return 0;
579}
580
581static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
582        unsigned long addr, unsigned long end,
583        const nodemask_t *nodes, unsigned long flags,
584        void *private)
585{
586    pud_t *pud;
587    unsigned long next;
588
589    pud = pud_offset(pgd, addr);
590    do {
591        next = pud_addr_end(addr, end);
592        if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
593            continue;
594        if (pud_none_or_clear_bad(pud))
595            continue;
596        if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
597                    flags, private))
598            return -EIO;
599    } while (pud++, addr = next, addr != end);
600    return 0;
601}
602
603static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
604        unsigned long addr, unsigned long end,
605        const nodemask_t *nodes, unsigned long flags,
606        void *private)
607{
608    pgd_t *pgd;
609    unsigned long next;
610
611    pgd = pgd_offset(vma->vm_mm, addr);
612    do {
613        next = pgd_addr_end(addr, end);
614        if (pgd_none_or_clear_bad(pgd))
615            continue;
616        if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
617                    flags, private))
618            return -EIO;
619    } while (pgd++, addr = next, addr != end);
620    return 0;
621}
622
623#ifdef CONFIG_NUMA_BALANCING
624/*
625 * This is used to mark a range of virtual addresses to be inaccessible.
626 * These are later cleared by a NUMA hinting fault. Depending on these
627 * faults, pages may be migrated for better NUMA placement.
628 *
629 * This is assuming that NUMA faults are handled using PROT_NONE. If
630 * an architecture makes a different choice, it will need further
631 * changes to the core.
632 */
633unsigned long change_prot_numa(struct vm_area_struct *vma,
634            unsigned long addr, unsigned long end)
635{
636    int nr_updated;
637
638    nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
639    if (nr_updated)
640        count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
641
642    return nr_updated;
643}
644#else
645static unsigned long change_prot_numa(struct vm_area_struct *vma,
646            unsigned long addr, unsigned long end)
647{
648    return 0;
649}
650#endif /* CONFIG_NUMA_BALANCING */
651
652/*
653 * Walk through page tables and collect pages to be migrated.
654 *
 655 * If pages found in a given range are on a set of nodes (determined by
 656 * @nodes and @flags), they are isolated and queued to the pagelist
 657 * passed via @private.
658 */
659static int
660queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
661        const nodemask_t *nodes, unsigned long flags, void *private)
662{
663    int err = 0;
664    struct vm_area_struct *vma, *prev;
665
666    vma = find_vma(mm, start);
667    if (!vma)
668        return -EFAULT;
669    prev = NULL;
670    for (; vma && vma->vm_start < end; vma = vma->vm_next) {
671        unsigned long endvma = vma->vm_end;
672
673        if (endvma > end)
674            endvma = end;
675        if (vma->vm_start > start)
676            start = vma->vm_start;
677
678        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
679            if (!vma->vm_next && vma->vm_end < end)
680                return -EFAULT;
681            if (prev && prev->vm_end < vma->vm_start)
682                return -EFAULT;
683        }
684
685        if (flags & MPOL_MF_LAZY) {
686            change_prot_numa(vma, start, endvma);
687            goto next;
688        }
689
690        if ((flags & MPOL_MF_STRICT) ||
691             ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
692              vma_migratable(vma))) {
693
694            err = queue_pages_pgd_range(vma, start, endvma, nodes,
695                        flags, private);
696            if (err)
697                break;
698        }
699next:
700        prev = vma;
701    }
702    return err;
703}
704
705/*
706 * Apply policy to a single VMA
707 * This must be called with the mmap_sem held for writing.
708 */
709static int vma_replace_policy(struct vm_area_struct *vma,
710                        struct mempolicy *pol)
711{
712    int err;
713    struct mempolicy *old;
714    struct mempolicy *new;
715
716    pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
717         vma->vm_start, vma->vm_end, vma->vm_pgoff,
718         vma->vm_ops, vma->vm_file,
719         vma->vm_ops ? vma->vm_ops->set_policy : NULL);
720
721    new = mpol_dup(pol);
722    if (IS_ERR(new))
723        return PTR_ERR(new);
724
725    if (vma->vm_ops && vma->vm_ops->set_policy) {
726        err = vma->vm_ops->set_policy(vma, new);
727        if (err)
728            goto err_out;
729    }
730
731    old = vma->vm_policy;
732    vma->vm_policy = new; /* protected by mmap_sem */
733    mpol_put(old);
734
735    return 0;
736 err_out:
737    mpol_put(new);
738    return err;
739}
740
741/* Step 2: apply policy to a range and do splits. */
742static int mbind_range(struct mm_struct *mm, unsigned long start,
743               unsigned long end, struct mempolicy *new_pol)
744{
745    struct vm_area_struct *next;
746    struct vm_area_struct *prev;
747    struct vm_area_struct *vma;
748    int err = 0;
749    pgoff_t pgoff;
750    unsigned long vmstart;
751    unsigned long vmend;
752
753    vma = find_vma(mm, start);
754    if (!vma || vma->vm_start > start)
755        return -EFAULT;
756
757    prev = vma->vm_prev;
758    if (start > vma->vm_start)
759        prev = vma;
760
761    for (; vma && vma->vm_start < end; prev = vma, vma = next) {
762        next = vma->vm_next;
763        vmstart = max(start, vma->vm_start);
764        vmend = min(end, vma->vm_end);
765
766        if (mpol_equal(vma_policy(vma), new_pol))
767            continue;
768
769        pgoff = vma->vm_pgoff +
770            ((vmstart - vma->vm_start) >> PAGE_SHIFT);
771        prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
772                  vma->anon_vma, vma->vm_file, pgoff,
773                  new_pol);
774        if (prev) {
775            vma = prev;
776            next = vma->vm_next;
777            if (mpol_equal(vma_policy(vma), new_pol))
778                continue;
779            /* vma_merge() joined vma && vma->next, case 8 */
780            goto replace;
781        }
782        if (vma->vm_start != vmstart) {
783            err = split_vma(vma->vm_mm, vma, vmstart, 1);
784            if (err)
785                goto out;
786        }
787        if (vma->vm_end != vmend) {
788            err = split_vma(vma->vm_mm, vma, vmend, 0);
789            if (err)
790                goto out;
791        }
792 replace:
793        err = vma_replace_policy(vma, new_pol);
794        if (err)
795            goto out;
796    }
797
798 out:
799    return err;
800}
801
802/* Set the process memory policy */
803static long do_set_mempolicy(unsigned short mode, unsigned short flags,
804                 nodemask_t *nodes)
805{
806    struct mempolicy *new, *old;
807    struct mm_struct *mm = current->mm;
808    NODEMASK_SCRATCH(scratch);
809    int ret;
810
811    if (!scratch)
812        return -ENOMEM;
813
814    new = mpol_new(mode, flags, nodes);
815    if (IS_ERR(new)) {
816        ret = PTR_ERR(new);
817        goto out;
818    }
819    /*
820     * prevent changing our mempolicy while show_numa_maps()
821     * is using it.
822     * Note: do_set_mempolicy() can be called at init time
823     * with no 'mm'.
824     */
825    if (mm)
826        down_write(&mm->mmap_sem);
827    task_lock(current);
828    ret = mpol_set_nodemask(new, nodes, scratch);
829    if (ret) {
830        task_unlock(current);
831        if (mm)
832            up_write(&mm->mmap_sem);
833        mpol_put(new);
834        goto out;
835    }
836    old = current->mempolicy;
837    current->mempolicy = new;
838    if (new && new->mode == MPOL_INTERLEAVE &&
839        nodes_weight(new->v.nodes))
840        current->il_next = first_node(new->v.nodes);
841    task_unlock(current);
842    if (mm)
843        up_write(&mm->mmap_sem);
844
845    mpol_put(old);
846    ret = 0;
847out:
848    NODEMASK_SCRATCH_FREE(scratch);
849    return ret;
850}
851
852/*
853 * Return nodemask for policy for get_mempolicy() query
854 *
855 * Called with task's alloc_lock held
856 */
857static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
858{
859    nodes_clear(*nodes);
860    if (p == &default_policy)
861        return;
862
863    switch (p->mode) {
864    case MPOL_BIND:
865        /* Fall through */
866    case MPOL_INTERLEAVE:
867        *nodes = p->v.nodes;
868        break;
869    case MPOL_PREFERRED:
870        if (!(p->flags & MPOL_F_LOCAL))
871            node_set(p->v.preferred_node, *nodes);
872        /* else return empty node mask for local allocation */
873        break;
874    default:
875        BUG();
876    }
877}
878
879static int lookup_node(struct mm_struct *mm, unsigned long addr)
880{
881    struct page *p;
882    int err;
883
884    err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
885    if (err >= 0) {
886        err = page_to_nid(p);
887        put_page(p);
888    }
889    return err;
890}
891
892/* Retrieve NUMA policy */
893static long do_get_mempolicy(int *policy, nodemask_t *nmask,
894                 unsigned long addr, unsigned long flags)
895{
896    int err;
897    struct mm_struct *mm = current->mm;
898    struct vm_area_struct *vma = NULL;
899    struct mempolicy *pol = current->mempolicy;
900
901    if (flags &
902        ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
903        return -EINVAL;
904
905    if (flags & MPOL_F_MEMS_ALLOWED) {
906        if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
907            return -EINVAL;
908        *policy = 0; /* just so it's initialized */
909        task_lock(current);
910        *nmask = cpuset_current_mems_allowed;
911        task_unlock(current);
912        return 0;
913    }
914
915    if (flags & MPOL_F_ADDR) {
916        /*
917         * Do NOT fall back to task policy if the
918         * vma/shared policy at addr is NULL. We
919         * want to return MPOL_DEFAULT in this case.
920         */
921        down_read(&mm->mmap_sem);
922        vma = find_vma_intersection(mm, addr, addr+1);
923        if (!vma) {
924            up_read(&mm->mmap_sem);
925            return -EFAULT;
926        }
927        if (vma->vm_ops && vma->vm_ops->get_policy)
928            pol = vma->vm_ops->get_policy(vma, addr);
929        else
930            pol = vma->vm_policy;
931    } else if (addr)
932        return -EINVAL;
933
934    if (!pol)
935        pol = &default_policy; /* indicates default behavior */
936
937    if (flags & MPOL_F_NODE) {
938        if (flags & MPOL_F_ADDR) {
939            err = lookup_node(mm, addr);
940            if (err < 0)
941                goto out;
942            *policy = err;
943        } else if (pol == current->mempolicy &&
944                pol->mode == MPOL_INTERLEAVE) {
945            *policy = current->il_next;
946        } else {
947            err = -EINVAL;
948            goto out;
949        }
950    } else {
951        *policy = pol == &default_policy ? MPOL_DEFAULT :
952                        pol->mode;
953        /*
954         * Internal mempolicy flags must be masked off before exposing
955         * the policy to userspace.
956         */
957        *policy |= (pol->flags & MPOL_MODE_FLAGS);
958    }
959
960    if (vma) {
961        up_read(&current->mm->mmap_sem);
962        vma = NULL;
963    }
964
965    err = 0;
966    if (nmask) {
967        if (mpol_store_user_nodemask(pol)) {
968            *nmask = pol->w.user_nodemask;
969        } else {
970            task_lock(current);
971            get_policy_nodemask(pol, nmask);
972            task_unlock(current);
973        }
974    }
975
976 out:
977    mpol_cond_put(pol);
978    if (vma)
979        up_read(&current->mm->mmap_sem);
980    return err;
981}
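/*
 * Illustrative userspace sketch, not part of this file (error handling
 * omitted): the MPOL_F_NODE | MPOL_F_ADDR query served above is commonly
 * used to ask which node currently backs a page; the node number comes back
 * through the "policy" argument:
 *
 *    int node;
 *    get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR);
 *
 * Retrieving a policy's nodemask instead requires a mask buffer of at least
 * MAX_NUMNODES bits (see the maxnode check in the syscall wrapper below);
 * libnuma probes for a suitable size at runtime.
 */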
982
983#ifdef CONFIG_MIGRATION
984/*
985 * page migration
986 */
987static void migrate_page_add(struct page *page, struct list_head *pagelist,
988                unsigned long flags)
989{
990    /*
991     * Avoid migrating a page that is shared with others.
992     */
993    if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
994        if (!isolate_lru_page(page)) {
995            list_add_tail(&page->lru, pagelist);
996            inc_zone_page_state(page, NR_ISOLATED_ANON +
997                        page_is_file_cache(page));
998        }
999    }
1000}
1001
1002static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1003{
1004    if (PageHuge(page))
1005        return alloc_huge_page_node(page_hstate(compound_head(page)),
1006                    node);
1007    else
1008        return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1009}
1010
1011/*
1012 * Migrate pages from one node to a target node.
1013 * Returns error or the number of pages not migrated.
1014 */
1015static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1016               int flags)
1017{
1018    nodemask_t nmask;
1019    LIST_HEAD(pagelist);
1020    int err = 0;
1021
1022    nodes_clear(nmask);
1023    node_set(source, nmask);
1024
1025    /*
1026     * This does not "check" the range but isolates all pages that
1027     * need migration. Between passing in the full user address
1028     * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1029     */
1030    VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1031    queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1032            flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1033
1034    if (!list_empty(&pagelist)) {
1035        err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1036                    MIGRATE_SYNC, MR_SYSCALL);
1037        if (err)
1038            putback_movable_pages(&pagelist);
1039    }
1040
1041    return err;
1042}
1043
1044/*
1045 * Move pages between the two nodesets so as to preserve the physical
1046 * layout as much as possible.
1047 *
 1048 * Returns the number of pages that could not be moved.
1049 */
1050int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1051             const nodemask_t *to, int flags)
1052{
1053    int busy = 0;
1054    int err;
1055    nodemask_t tmp;
1056
1057    err = migrate_prep();
1058    if (err)
1059        return err;
1060
1061    down_read(&mm->mmap_sem);
1062
1063    err = migrate_vmas(mm, from, to, flags);
1064    if (err)
1065        goto out;
1066
1067    /*
1068     * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1069     * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1070     * bit in 'tmp', and return that <source, dest> pair for migration.
1071     * The pair of nodemasks 'to' and 'from' define the map.
1072     *
1073     * If no pair of bits is found that way, fallback to picking some
1074     * pair of 'source' and 'dest' bits that are not the same. If the
1075     * 'source' and 'dest' bits are the same, this represents a node
1076     * that will be migrating to itself, so no pages need move.
1077     *
1078     * If no bits are left in 'tmp', or if all remaining bits left
1079     * in 'tmp' correspond to the same bit in 'to', return false
1080     * (nothing left to migrate).
1081     *
1082     * This lets us pick a pair of nodes to migrate between, such that
1083     * if possible the dest node is not already occupied by some other
1084     * source node, minimizing the risk of overloading the memory on a
1085     * node that would happen if we migrated incoming memory to a node
 1086     * before migrating outgoing memory from that same node.
1087     *
1088     * A single scan of tmp is sufficient. As we go, we remember the
1089     * most recent <s, d> pair that moved (s != d). If we find a pair
1090     * that not only moved, but what's better, moved to an empty slot
1091     * (d is not set in tmp), then we break out then, with that pair.
 1092     * Otherwise when we finish scanning tmp, we at least have the
1093     * most recent <s, d> pair that moved. If we get all the way through
1094     * the scan of tmp without finding any node that moved, much less
1095     * moved to an empty node, then there is nothing left worth migrating.
1096     */
1097
1098    tmp = *from;
1099    while (!nodes_empty(tmp)) {
1100        int s,d;
1101        int source = NUMA_NO_NODE;
1102        int dest = 0;
1103
1104        for_each_node_mask(s, tmp) {
1105
1106            /*
1107             * do_migrate_pages() tries to maintain the relative
1108             * node relationship of the pages established between
1109             * threads and memory areas.
 1110             *
1111             * However if the number of source nodes is not equal to
1112             * the number of destination nodes we can not preserve
1113             * this node relative relationship. In that case, skip
1114             * copying memory from a node that is in the destination
1115             * mask.
1116             *
1117             * Example: [2,3,4] -> [3,4,5] moves everything.
 1118             * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1119             */
1120
1121            if ((nodes_weight(*from) != nodes_weight(*to)) &&
1122                        (node_isset(s, *to)))
1123                continue;
1124
1125            d = node_remap(s, *from, *to);
1126            if (s == d)
1127                continue;
1128
1129            source = s; /* Node moved. Memorize */
1130            dest = d;
1131
1132            /* dest not in remaining from nodes? */
1133            if (!node_isset(dest, tmp))
1134                break;
1135        }
1136        if (source == NUMA_NO_NODE)
1137            break;
1138
1139        node_clear(source, tmp);
1140        err = migrate_to_node(mm, source, dest, flags);
1141        if (err > 0)
1142            busy += err;
1143        if (err < 0)
1144            break;
1145    }
1146out:
1147    up_read(&mm->mmap_sem);
1148    if (err < 0)
1149        return err;
1150    return busy;
1151
1152}
1153
1154/*
1155 * Allocate a new page for page migration based on vma policy.
1156 * Start by assuming the page is mapped by the same vma as contains @start.
1157 * Search forward from there, if not. N.B., this assumes that the
1158 * list of pages handed to migrate_pages()--which is how we get here--
1159 * is in virtual address order.
1160 */
1161static struct page *new_page(struct page *page, unsigned long start, int **x)
1162{
1163    struct vm_area_struct *vma;
1164    unsigned long uninitialized_var(address);
1165
1166    vma = find_vma(current->mm, start);
1167    while (vma) {
1168        address = page_address_in_vma(page, vma);
1169        if (address != -EFAULT)
1170            break;
1171        vma = vma->vm_next;
1172    }
1173
1174    if (PageHuge(page)) {
1175        BUG_ON(!vma);
1176        return alloc_huge_page_noerr(vma, address, 1);
1177    }
1178    /*
1179     * if !vma, alloc_page_vma() will use task or system default policy
1180     */
1181    return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1182}
1183#else
1184
1185static void migrate_page_add(struct page *page, struct list_head *pagelist,
1186                unsigned long flags)
1187{
1188}
1189
1190int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1191             const nodemask_t *to, int flags)
1192{
1193    return -ENOSYS;
1194}
1195
1196static struct page *new_page(struct page *page, unsigned long start, int **x)
1197{
1198    return NULL;
1199}
1200#endif
1201
1202static long do_mbind(unsigned long start, unsigned long len,
1203             unsigned short mode, unsigned short mode_flags,
1204             nodemask_t *nmask, unsigned long flags)
1205{
1206    struct mm_struct *mm = current->mm;
1207    struct mempolicy *new;
1208    unsigned long end;
1209    int err;
1210    LIST_HEAD(pagelist);
1211
1212    if (flags & ~(unsigned long)MPOL_MF_VALID)
1213        return -EINVAL;
1214    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1215        return -EPERM;
1216
1217    if (start & ~PAGE_MASK)
1218        return -EINVAL;
1219
1220    if (mode == MPOL_DEFAULT)
1221        flags &= ~MPOL_MF_STRICT;
1222
1223    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1224    end = start + len;
1225
1226    if (end < start)
1227        return -EINVAL;
1228    if (end == start)
1229        return 0;
1230
1231    new = mpol_new(mode, mode_flags, nmask);
1232    if (IS_ERR(new))
1233        return PTR_ERR(new);
1234
1235    if (flags & MPOL_MF_LAZY)
1236        new->flags |= MPOL_F_MOF;
1237
1238    /*
1239     * If we are using the default policy then operation
1240     * on discontinuous address spaces is okay after all
1241     */
1242    if (!new)
1243        flags |= MPOL_MF_DISCONTIG_OK;
1244
1245    pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1246         start, start + len, mode, mode_flags,
1247         nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1248
1249    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1250
1251        err = migrate_prep();
1252        if (err)
1253            goto mpol_out;
1254    }
1255    {
1256        NODEMASK_SCRATCH(scratch);
1257        if (scratch) {
1258            down_write(&mm->mmap_sem);
1259            task_lock(current);
1260            err = mpol_set_nodemask(new, nmask, scratch);
1261            task_unlock(current);
1262            if (err)
1263                up_write(&mm->mmap_sem);
1264        } else
1265            err = -ENOMEM;
1266        NODEMASK_SCRATCH_FREE(scratch);
1267    }
1268    if (err)
1269        goto mpol_out;
1270
1271    err = queue_pages_range(mm, start, end, nmask,
1272              flags | MPOL_MF_INVERT, &pagelist);
1273    if (!err)
1274        err = mbind_range(mm, start, end, new);
1275
1276    if (!err) {
1277        int nr_failed = 0;
1278
1279        if (!list_empty(&pagelist)) {
1280            WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1281            nr_failed = migrate_pages(&pagelist, new_page, NULL,
1282                start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1283            if (nr_failed)
1284                putback_movable_pages(&pagelist);
1285        }
1286
1287        if (nr_failed && (flags & MPOL_MF_STRICT))
1288            err = -EIO;
1289    } else
1290        putback_movable_pages(&pagelist);
1291
1292    up_write(&mm->mmap_sem);
1293 mpol_out:
1294    mpol_put(new);
1295    return err;
1296}
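/*
 * Illustrative userspace sketch, not part of this file (error handling
 * omitted, node number made up): besides setting the VMA policy, mbind(2)
 * can migrate the pages already backing the range, which is the
 * MPOL_MF_MOVE path handled above:
 *
 *    unsigned long mask = 1UL << 2;
 *    mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask) + 1,
 *          MPOL_MF_MOVE | MPOL_MF_STRICT);
 *
 * With MPOL_MF_STRICT the call fails with -EIO if some pages could not be
 * moved (see the nr_failed handling above).
 */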
1297
1298/*
1299 * User space interface with variable sized bitmaps for nodelists.
1300 */
1301
1302/* Copy a node mask from user space. */
1303static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1304             unsigned long maxnode)
1305{
1306    unsigned long k;
1307    unsigned long nlongs;
1308    unsigned long endmask;
1309
1310    --maxnode;
1311    nodes_clear(*nodes);
1312    if (maxnode == 0 || !nmask)
1313        return 0;
1314    if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1315        return -EINVAL;
1316
1317    nlongs = BITS_TO_LONGS(maxnode);
1318    if ((maxnode % BITS_PER_LONG) == 0)
1319        endmask = ~0UL;
1320    else
1321        endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1322
 1323    /* When the user specifies more nodes than supported, just check
 1324       that the unsupported part is all zero. */
1325    if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1326        if (nlongs > PAGE_SIZE/sizeof(long))
1327            return -EINVAL;
1328        for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1329            unsigned long t;
1330            if (get_user(t, nmask + k))
1331                return -EFAULT;
1332            if (k == nlongs - 1) {
1333                if (t & endmask)
1334                    return -EINVAL;
1335            } else if (t)
1336                return -EINVAL;
1337        }
1338        nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1339        endmask = ~0UL;
1340    }
1341
1342    if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1343        return -EFAULT;
1344    nodes_addr(*nodes)[nlongs-1] &= endmask;
1345    return 0;
1346}
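/*
 * Worked example with illustrative values (64-bit kernel): a caller passing
 * maxnode = 49 means "the mask has 48 valid bits". After --maxnode we get
 * nlongs = BITS_TO_LONGS(48) = 1 and endmask = (1UL << 48) - 1, so exactly
 * one long is copied from userspace and then masked down to bits 0-47. Only
 * when the user's mask is wider than MAX_NUMNODES does the loop above have
 * to verify that the unsupported part is all zero.
 */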
1347
1348/* Copy a kernel node mask to user space */
1349static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1350                  nodemask_t *nodes)
1351{
1352    unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1353    const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1354
1355    if (copy > nbytes) {
1356        if (copy > PAGE_SIZE)
1357            return -EINVAL;
1358        if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1359            return -EFAULT;
1360        copy = nbytes;
1361    }
1362    return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1363}
1364
1365SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1366        unsigned long, mode, const unsigned long __user *, nmask,
1367        unsigned long, maxnode, unsigned, flags)
1368{
1369    nodemask_t nodes;
1370    int err;
1371    unsigned short mode_flags;
1372
1373    mode_flags = mode & MPOL_MODE_FLAGS;
1374    mode &= ~MPOL_MODE_FLAGS;
1375    if (mode >= MPOL_MAX)
1376        return -EINVAL;
1377    if ((mode_flags & MPOL_F_STATIC_NODES) &&
1378        (mode_flags & MPOL_F_RELATIVE_NODES))
1379        return -EINVAL;
1380    err = get_nodes(&nodes, nmask, maxnode);
1381    if (err)
1382        return err;
1383    return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1384}
1385
1386/* Set the process memory policy */
1387SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1388        unsigned long, maxnode)
1389{
1390    int err;
1391    nodemask_t nodes;
1392    unsigned short flags;
1393
1394    flags = mode & MPOL_MODE_FLAGS;
1395    mode &= ~MPOL_MODE_FLAGS;
1396    if ((unsigned int)mode >= MPOL_MAX)
1397        return -EINVAL;
1398    if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1399        return -EINVAL;
1400    err = get_nodes(&nodes, nmask, maxnode);
1401    if (err)
1402        return err;
1403    return do_set_mempolicy(mode, flags, &nodes);
1404}
1405
1406SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1407        const unsigned long __user *, old_nodes,
1408        const unsigned long __user *, new_nodes)
1409{
1410    const struct cred *cred = current_cred(), *tcred;
1411    struct mm_struct *mm = NULL;
1412    struct task_struct *task;
1413    nodemask_t task_nodes;
1414    int err;
1415    nodemask_t *old;
1416    nodemask_t *new;
1417    NODEMASK_SCRATCH(scratch);
1418
1419    if (!scratch)
1420        return -ENOMEM;
1421
1422    old = &scratch->mask1;
1423    new = &scratch->mask2;
1424
1425    err = get_nodes(old, old_nodes, maxnode);
1426    if (err)
1427        goto out;
1428
1429    err = get_nodes(new, new_nodes, maxnode);
1430    if (err)
1431        goto out;
1432
1433    /* Find the mm_struct */
1434    rcu_read_lock();
1435    task = pid ? find_task_by_vpid(pid) : current;
1436    if (!task) {
1437        rcu_read_unlock();
1438        err = -ESRCH;
1439        goto out;
1440    }
1441    get_task_struct(task);
1442
1443    err = -EINVAL;
1444
1445    /*
1446     * Check if this process has the right to modify the specified
1447     * process. The right exists if the process has administrative
1448     * capabilities, superuser privileges or the same
1449     * userid as the target process.
1450     */
1451    tcred = __task_cred(task);
1452    if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1453        !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1454        !capable(CAP_SYS_NICE)) {
1455        rcu_read_unlock();
1456        err = -EPERM;
1457        goto out_put;
1458    }
1459    rcu_read_unlock();
1460
1461    task_nodes = cpuset_mems_allowed(task);
1462    /* Is the user allowed to access the target nodes? */
1463    if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1464        err = -EPERM;
1465        goto out_put;
1466    }
1467
1468    if (!nodes_subset(*new, node_states[N_MEMORY])) {
1469        err = -EINVAL;
1470        goto out_put;
1471    }
1472
1473    err = security_task_movememory(task);
1474    if (err)
1475        goto out_put;
1476
1477    mm = get_task_mm(task);
1478    put_task_struct(task);
1479
1480    if (!mm) {
1481        err = -EINVAL;
1482        goto out;
1483    }
1484
1485    err = do_migrate_pages(mm, old, new,
1486        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1487
1488    mmput(mm);
1489out:
1490    NODEMASK_SCRATCH_FREE(scratch);
1491
1492    return err;
1493
1494out_put:
1495    put_task_struct(task);
1496    goto out;
1497
1498}
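/*
 * Illustrative userspace sketch, not part of this file (error handling
 * omitted, node numbers made up): moving another process's pages from node 0
 * to node 1 with the syscall implemented above. The masks use the same
 * maxnode convention as mbind(2)/set_mempolicy(2).
 *
 *    unsigned long old = 1UL << 0, new = 1UL << 1;
 *    long left = migrate_pages(pid, 8 * sizeof(old) + 1, &old, &new);
 *
 * A positive return value is the number of pages that could not be moved;
 * the caller needs CAP_SYS_NICE or matching credentials, as checked above.
 */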
1499
1500
1501/* Retrieve NUMA policy */
1502SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1503        unsigned long __user *, nmask, unsigned long, maxnode,
1504        unsigned long, addr, unsigned long, flags)
1505{
1506    int err;
1507    int uninitialized_var(pval);
1508    nodemask_t nodes;
1509
1510    if (nmask != NULL && maxnode < MAX_NUMNODES)
1511        return -EINVAL;
1512
1513    err = do_get_mempolicy(&pval, &nodes, addr, flags);
1514
1515    if (err)
1516        return err;
1517
1518    if (policy && put_user(pval, policy))
1519        return -EFAULT;
1520
1521    if (nmask)
1522        err = copy_nodes_to_user(nmask, maxnode, &nodes);
1523
1524    return err;
1525}
1526
1527#ifdef CONFIG_COMPAT
1528
1529COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1530               compat_ulong_t __user *, nmask,
1531               compat_ulong_t, maxnode,
1532               compat_ulong_t, addr, compat_ulong_t, flags)
1533{
1534    long err;
1535    unsigned long __user *nm = NULL;
1536    unsigned long nr_bits, alloc_size;
1537    DECLARE_BITMAP(bm, MAX_NUMNODES);
1538
1539    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1540    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1541
1542    if (nmask)
1543        nm = compat_alloc_user_space(alloc_size);
1544
1545    err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1546
1547    if (!err && nmask) {
1548        unsigned long copy_size;
1549        copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1550        err = copy_from_user(bm, nm, copy_size);
1551        /* ensure entire bitmap is zeroed */
1552        err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1553        err |= compat_put_bitmap(nmask, bm, nr_bits);
1554    }
1555
1556    return err;
1557}
1558
1559COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1560               compat_ulong_t, maxnode)
1561{
1562    long err = 0;
1563    unsigned long __user *nm = NULL;
1564    unsigned long nr_bits, alloc_size;
1565    DECLARE_BITMAP(bm, MAX_NUMNODES);
1566
1567    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1568    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1569
1570    if (nmask) {
1571        err = compat_get_bitmap(bm, nmask, nr_bits);
1572        nm = compat_alloc_user_space(alloc_size);
1573        err |= copy_to_user(nm, bm, alloc_size);
1574    }
1575
1576    if (err)
1577        return -EFAULT;
1578
1579    return sys_set_mempolicy(mode, nm, nr_bits+1);
1580}
1581
1582COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1583               compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1584               compat_ulong_t, maxnode, compat_ulong_t, flags)
1585{
1586    long err = 0;
1587    unsigned long __user *nm = NULL;
1588    unsigned long nr_bits, alloc_size;
1589    nodemask_t bm;
1590
1591    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1592    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1593
1594    if (nmask) {
1595        err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1596        nm = compat_alloc_user_space(alloc_size);
1597        err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1598    }
1599
1600    if (err)
1601        return -EFAULT;
1602
1603    return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1604}
1605
1606#endif
1607
1608/*
1609 * get_vma_policy(@task, @vma, @addr)
1610 * @task: task for fallback if vma policy == default
1611 * @vma: virtual memory area whose policy is sought
1612 * @addr: address in @vma for shared policy lookup
1613 *
1614 * Returns effective policy for a VMA at specified address.
1615 * Falls back to @task or system default policy, as necessary.
1616 * Current or other task's task mempolicy and non-shared vma policies must be
1617 * protected by task_lock(task) by the caller.
1618 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1619 * count--added by the get_policy() vm_op, as appropriate--to protect against
1620 * freeing by another task. It is the caller's responsibility to free the
1621 * extra reference for shared policies.
1622 */
1623struct mempolicy *get_vma_policy(struct task_struct *task,
1624        struct vm_area_struct *vma, unsigned long addr)
1625{
1626    struct mempolicy *pol = get_task_policy(task);
1627
1628    if (vma) {
1629        if (vma->vm_ops && vma->vm_ops->get_policy) {
1630            struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1631                                    addr);
1632            if (vpol)
1633                pol = vpol;
1634        } else if (vma->vm_policy) {
1635            pol = vma->vm_policy;
1636
1637            /*
1638             * shmem_alloc_page() passes MPOL_F_SHARED policy with
1639             * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1640             * count on these policies which will be dropped by
1641             * mpol_cond_put() later
1642             */
1643            if (mpol_needs_cond_ref(pol))
1644                mpol_get(pol);
1645        }
1646    }
1647    if (!pol)
1648        pol = &default_policy;
1649    return pol;
1650}
1651
1652bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1653{
1654    struct mempolicy *pol = get_task_policy(task);
1655    if (vma) {
1656        if (vma->vm_ops && vma->vm_ops->get_policy) {
1657            bool ret = false;
1658
1659            pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1660            if (pol && (pol->flags & MPOL_F_MOF))
1661                ret = true;
1662            mpol_cond_put(pol);
1663
1664            return ret;
1665        } else if (vma->vm_policy) {
1666            pol = vma->vm_policy;
1667        }
1668    }
1669
1670    if (!pol)
1671        return default_policy.flags & MPOL_F_MOF;
1672
1673    return pol->flags & MPOL_F_MOF;
1674}
1675
1676static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1677{
1678    enum zone_type dynamic_policy_zone = policy_zone;
1679
1680    BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1681
1682    /*
 1683     * If policy->v.nodes has movable memory only, we apply the policy
 1684     * only when gfp_zone(gfp) is ZONE_MOVABLE.
 1685     *
 1686     * policy->v.nodes is intersected with node_states[N_MEMORY], so if
 1687     * the following test fails, it implies that policy->v.nodes contains
 1688     * movable memory only.
1689     */
1690    if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1691        dynamic_policy_zone = ZONE_MOVABLE;
1692
1693    return zone >= dynamic_policy_zone;
1694}
1695
1696/*
1697 * Return a nodemask representing a mempolicy for filtering nodes for
1698 * page allocation
1699 */
1700static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1701{
1702    /* Lower zones don't get a nodemask applied for MPOL_BIND */
1703    if (unlikely(policy->mode == MPOL_BIND) &&
1704            apply_policy_zone(policy, gfp_zone(gfp)) &&
1705            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1706        return &policy->v.nodes;
1707
1708    return NULL;
1709}
1710
1711/* Return a zonelist indicated by gfp for node representing a mempolicy */
1712static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1713    int nd)
1714{
1715    switch (policy->mode) {
1716    case MPOL_PREFERRED:
1717        if (!(policy->flags & MPOL_F_LOCAL))
1718            nd = policy->v.preferred_node;
1719        break;
1720    case MPOL_BIND:
1721        /*
1722         * Normally, MPOL_BIND allocations are node-local within the
1723         * allowed nodemask. However, if __GFP_THISNODE is set and the
1724         * current node isn't part of the mask, we use the zonelist for
1725         * the first node in the mask instead.
1726         */
1727        if (unlikely(gfp & __GFP_THISNODE) &&
1728                unlikely(!node_isset(nd, policy->v.nodes)))
1729            nd = first_node(policy->v.nodes);
1730        break;
1731    default:
1732        BUG();
1733    }
1734    return node_zonelist(nd, gfp);
1735}
1736
1737/* Do dynamic interleaving for a process */
1738static unsigned interleave_nodes(struct mempolicy *policy)
1739{
1740    unsigned nid, next;
1741    struct task_struct *me = current;
1742
1743    nid = me->il_next;
1744    next = next_node(nid, policy->v.nodes);
1745    if (next >= MAX_NUMNODES)
1746        next = first_node(policy->v.nodes);
1747    if (next < MAX_NUMNODES)
1748        me->il_next = next;
1749    return nid;
1750}
1751
1752/*
1753 * Depending on the memory policy provide a node from which to allocate the
1754 * next slab entry.
1755 */
1756unsigned int mempolicy_slab_node(void)
1757{
1758    struct mempolicy *policy;
1759    int node = numa_mem_id();
1760
1761    if (in_interrupt())
1762        return node;
1763
1764    policy = current->mempolicy;
1765    if (!policy || policy->flags & MPOL_F_LOCAL)
1766        return node;
1767
1768    switch (policy->mode) {
1769    case MPOL_PREFERRED:
1770        /*
1771         * handled MPOL_F_LOCAL above
1772         */
1773        return policy->v.preferred_node;
1774
1775    case MPOL_INTERLEAVE:
1776        return interleave_nodes(policy);
1777
1778    case MPOL_BIND: {
1779        /*
1780         * Follow bind policy behavior and start allocation at the
1781         * first node.
1782         */
1783        struct zonelist *zonelist;
1784        struct zone *zone;
1785        enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1786        zonelist = &NODE_DATA(node)->node_zonelists[0];
1787        (void)first_zones_zonelist(zonelist, highest_zoneidx,
1788                            &policy->v.nodes,
1789                            &zone);
1790        return zone ? zone->node : node;
1791    }
1792
1793    default:
1794        BUG();
1795    }
1796}
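/*
 * Illustrative sketch only (hypothetical caller): a slab-style user of
 * the helper above would feed the returned node id straight into a
 * node-targeted page allocation.
 */
static struct page *example_slab_page(gfp_t gfp, unsigned int order)
{
    return alloc_pages_node(mempolicy_slab_node(), gfp, order);
}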
1797
1798/* Do static interleaving for a VMA with known offset. */
1799static unsigned offset_il_node(struct mempolicy *pol,
1800        struct vm_area_struct *vma, unsigned long off)
1801{
1802    unsigned nnodes = nodes_weight(pol->v.nodes);
1803    unsigned target;
1804    int c;
1805    int nid = NUMA_NO_NODE;
1806
1807    if (!nnodes)
1808        return numa_node_id();
1809    target = (unsigned int)off % nnodes;
1810    c = 0;
1811    do {
1812        nid = next_node(nid, pol->v.nodes);
1813        c++;
1814    } while (c <= target);
1815    return nid;
1816}
1817
1818/* Determine a node number for interleave */
1819static inline unsigned interleave_nid(struct mempolicy *pol,
1820         struct vm_area_struct *vma, unsigned long addr, int shift)
1821{
1822    if (vma) {
1823        unsigned long off;
1824
1825        /*
1826         * for small pages, there is no difference between
1827         * shift and PAGE_SHIFT, so the bit-shift is safe.
1828         * for huge pages, since vm_pgoff is in units of small
1829         * pages, we need to shift off the always 0 bits to get
1830         * a useful offset.
1831         */
1832        BUG_ON(shift < PAGE_SHIFT);
1833        off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1834        off += (addr - vma->vm_start) >> shift;
1835        return offset_il_node(pol, vma, off);
1836    } else
1837        return interleave_nodes(pol);
1838}
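/*
 * Worked example (assuming 4KB base pages, i.e. PAGE_SHIFT == 12): for a
 * fault on a 2MB huge page, shift == 21, so the code above computes
 *
 *     off  = vma->vm_pgoff >> 9;            (512 base pages per huge page)
 *     off += (addr - vma->vm_start) >> 21;  (huge-page index into the VMA)
 *
 * and offset_il_node() then maps that huge-page index onto the
 * interleave nodemask.
 */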
1839
1840/*
1841 * Return the bit number of a random bit set in the nodemask.
1842 * (returns NUMA_NO_NODE if nodemask is empty)
1843 */
1844int node_random(const nodemask_t *maskp)
1845{
1846    int w, bit = NUMA_NO_NODE;
1847
1848    w = nodes_weight(*maskp);
1849    if (w)
1850        bit = bitmap_ord_to_pos(maskp->bits,
1851            get_random_int() % w, MAX_NUMNODES);
1852    return bit;
1853}
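/*
 * Illustrative sketch only (hypothetical helper): pick a random node
 * that has memory, falling back to the local node if the mask were
 * unexpectedly empty.
 */
static int example_random_memory_node(void)
{
    int nid = node_random(&node_states[N_MEMORY]);

    return (nid != NUMA_NO_NODE) ? nid : numa_node_id();
}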
1854
1855#ifdef CONFIG_HUGETLBFS
1856/*
1857 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol, @nodemask)
1858 * @vma: virtual memory area whose policy is sought
1859 * @addr: address in @vma for shared policy lookup and interleave policy
1860 * @gfp_flags: for requested zone
1861 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1862 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1863 *
1864 * Returns a zonelist suitable for a huge page allocation and a pointer
1865 * to the struct mempolicy for conditional unref after allocation.
1866 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1867 * @nodemask for filtering the zonelist.
1868 *
1869 * Must be protected by read_mems_allowed_begin()
1870 */
1871struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1872                gfp_t gfp_flags, struct mempolicy **mpol,
1873                nodemask_t **nodemask)
1874{
1875    struct zonelist *zl;
1876
1877    *mpol = get_vma_policy(current, vma, addr);
1878    *nodemask = NULL; /* assume !MPOL_BIND */
1879
1880    if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1881        zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1882                huge_page_shift(hstate_vma(vma))), gfp_flags);
1883    } else {
1884        zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1885        if ((*mpol)->mode == MPOL_BIND)
1886            *nodemask = &(*mpol)->v.nodes;
1887    }
1888    return zl;
1889}
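/*
 * Illustrative sketch only (hypothetical caller, loosely modelled on the
 * hugetlb dequeue path): look up the policy-derived zonelist under the
 * read_mems_allowed_begin()/retry() protocol and drop the conditional
 * policy reference when done. GFP_HIGHUSER stands in for the real
 * huge-page gfp mask.
 */
static int example_huge_target_node(struct vm_area_struct *vma,
                                    unsigned long addr)
{
    struct mempolicy *mpol;
    nodemask_t *nodemask;
    struct zonelist *zl;
    struct zone *zone;
    unsigned int cookie;
    int nid = NUMA_NO_NODE;

    do {
        cookie = read_mems_allowed_begin();
        zl = huge_zonelist(vma, addr, GFP_HIGHUSER, &mpol, &nodemask);
        (void)first_zones_zonelist(zl, gfp_zone(GFP_HIGHUSER),
                                   nodemask, &zone);
        if (zone)
            nid = zone->node;
        mpol_cond_put(mpol);
    } while (read_mems_allowed_retry(cookie));

    return nid;
}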
1890
1891/*
1892 * init_nodemask_of_mempolicy
1893 *
1894 * If the current task's mempolicy is "default" [NULL], return 'false'
1895 * to indicate default policy. Otherwise, extract the policy nodemask
1896 * for 'bind' or 'interleave' policy into the argument nodemask, or
1897 * initialize the argument nodemask to contain the single node for
1898 * 'preferred' or 'local' policy and return 'true' to indicate presence
1899 * of non-default mempolicy.
1900 *
1901 * We don't bother with reference counting the mempolicy [mpol_get/put]
1902 * because the current task is examining its own mempolicy and a task's
1903 * mempolicy is only ever changed by the task itself.
1904 *
1905 * N.B., it is the caller's responsibility to free a returned nodemask.
1906 */
1907bool init_nodemask_of_mempolicy(nodemask_t *mask)
1908{
1909    struct mempolicy *mempolicy;
1910    int nid;
1911
1912    if (!(mask && current->mempolicy))
1913        return false;
1914
1915    task_lock(current);
1916    mempolicy = current->mempolicy;
1917    switch (mempolicy->mode) {
1918    case MPOL_PREFERRED:
1919        if (mempolicy->flags & MPOL_F_LOCAL)
1920            nid = numa_node_id();
1921        else
1922            nid = mempolicy->v.preferred_node;
1923        init_nodemask_of_node(mask, nid);
1924        break;
1925
1926    case MPOL_BIND:
1927        /* Fall through */
1928    case MPOL_INTERLEAVE:
1929        *mask = mempolicy->v.nodes;
1930        break;
1931
1932    default:
1933        BUG();
1934    }
1935    task_unlock(current);
1936
1937    return true;
1938}
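/*
 * Illustrative sketch only (hypothetical helper, roughly how the hugetlb
 * sysfs code consumes the function above): build a nodemask from the
 * current task's policy, or fall back to every node with memory.
 */
static nodemask_t *example_policy_nodes(nodemask_t *scratch)
{
    if (scratch && init_nodemask_of_mempolicy(scratch))
        return scratch;
    return &node_states[N_MEMORY];
}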
1939#endif
1940
1941/*
1942 * mempolicy_nodemask_intersects
1943 *
1944 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1945 * policy. Otherwise, check for intersection between mask and the policy
1946 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1947 * policy, always return true since it may allocate elsewhere on fallback.
1948 *
1949 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1950 */
1951bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1952                    const nodemask_t *mask)
1953{
1954    struct mempolicy *mempolicy;
1955    bool ret = true;
1956
1957    if (!mask)
1958        return ret;
1959    task_lock(tsk);
1960    mempolicy = tsk->mempolicy;
1961    if (!mempolicy)
1962        goto out;
1963
1964    switch (mempolicy->mode) {
1965    case MPOL_PREFERRED:
1966        /*
1967         * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1968         * allocate from, they may fallback to other nodes when oom.
1969         * Thus, it's possible for tsk to have allocated memory from
1970         * nodes in mask.
1971         */
1972        break;
1973    case MPOL_BIND:
1974    case MPOL_INTERLEAVE:
1975        ret = nodes_intersects(mempolicy->v.nodes, *mask);
1976        break;
1977    default:
1978        BUG();
1979    }
1980out:
1981    task_unlock(tsk);
1982    return ret;
1983}
1984
1985/* Allocate a page in interleaved policy.
1986   Own path because it needs to do special accounting. */
1987static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1988                    unsigned nid)
1989{
1990    struct zonelist *zl;
1991    struct page *page;
1992
1993    zl = node_zonelist(nid, gfp);
1994    page = __alloc_pages(gfp, order, zl);
1995    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1996        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1997    return page;
1998}
1999
2000/**
2001 * alloc_pages_vma - Allocate a page for a VMA.
2002 *
2003 * @gfp:
2004 * %GFP_USER user allocation.
2005 * %GFP_KERNEL kernel allocations,
2006 * %GFP_HIGHMEM highmem/user allocations,
2007 * %GFP_FS allocation should not call back into a file system.
2008 * %GFP_ATOMIC don't sleep.
2009 *
2010 * @order:Order of the GFP allocation.
2011 * @vma: Pointer to VMA or NULL if not available.
2012 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2013 *
2014 * This function allocates a page from the kernel page pool and applies
2015 * a NUMA policy associated with the VMA or the current process.
2016 * When @vma is not NULL, the caller must hold down_read on the mmap_sem
2017 * of the VMA's mm_struct to prevent it from going away. Should be used
2018 * for all allocations of pages that will be mapped into user space.
2019 * Returns NULL when no page can be allocated.
2020 *
2021 * Should be called with the mmap_sem of the vma held.
2022 */
2023struct page *
2024alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2025        unsigned long addr, int node)
2026{
2027    struct mempolicy *pol;
2028    struct page *page;
2029    unsigned int cpuset_mems_cookie;
2030
2031retry_cpuset:
2032    pol = get_vma_policy(current, vma, addr);
2033    cpuset_mems_cookie = read_mems_allowed_begin();
2034
2035    if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2036        unsigned nid;
2037
2038        nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2039        mpol_cond_put(pol);
2040        page = alloc_page_interleave(gfp, order, nid);
2041        if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2042            goto retry_cpuset;
2043
2044        return page;
2045    }
2046    page = __alloc_pages_nodemask(gfp, order,
2047                      policy_zonelist(gfp, pol, node),
2048                      policy_nodemask(gfp, pol));
2049    if (unlikely(mpol_needs_cond_ref(pol)))
2050        __mpol_put(pol);
2051    if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2052        goto retry_cpuset;
2053    return page;
2054}
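/*
 * Illustrative sketch only (hypothetical caller): a fault handler
 * allocating an anonymous page for user space calls this with mmap_sem
 * held for read; this is roughly what the alloc_page_vma() wrapper
 * expands to.
 */
static struct page *example_anon_page(struct vm_area_struct *vma,
                                      unsigned long addr)
{
    return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
                           numa_node_id());
}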
2055
2056/**
2057 * alloc_pages_current - Allocate pages.
2058 *
2059 * @gfp:
2060 * %GFP_USER user allocation,
2061 * %GFP_KERNEL kernel allocation,
2062 * %GFP_HIGHMEM highmem allocation,
2063 * %GFP_FS don't call back into a file system.
2064 * %GFP_ATOMIC don't sleep.
2065 * @order: Power of two of allocation size in pages. 0 is a single page.
2066 *
2067 * Allocate a page from the kernel page pool and, when not in interrupt
2068 * context, apply the current process' NUMA policy.
2069 * Returns NULL when no page can be allocated.
2070 *
2071 * Don't call cpuset_update_task_memory_state() unless
2072 * 1) it's ok to take cpuset_sem (can WAIT), and
2073 * 2) allocating for current task (not interrupt).
2074 */
2075struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2076{
2077    struct mempolicy *pol = get_task_policy(current);
2078    struct page *page;
2079    unsigned int cpuset_mems_cookie;
2080
2081    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2082        pol = &default_policy;
2083
2084retry_cpuset:
2085    cpuset_mems_cookie = read_mems_allowed_begin();
2086
2087    /*
2088     * No reference counting needed for current->mempolicy
2089     * nor system default_policy
2090     */
2091    if (pol->mode == MPOL_INTERLEAVE)
2092        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2093    else
2094        page = __alloc_pages_nodemask(gfp, order,
2095                policy_zonelist(gfp, pol, numa_node_id()),
2096                policy_nodemask(gfp, pol));
2097
2098    if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2099        goto retry_cpuset;
2100
2101    return page;
2102}
2103EXPORT_SYMBOL(alloc_pages_current);
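/*
 * Note (on CONFIG_NUMA kernels): the generic alloc_pages(gfp, order)
 * helper resolves to alloc_pages_current(), so a plain
 *
 *     struct page *page = alloc_pages(GFP_KERNEL, 0);
 *
 * picks up the calling task's mempolicy automatically.
 */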
2104
2105int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2106{
2107    struct mempolicy *pol = mpol_dup(vma_policy(src));
2108
2109    if (IS_ERR(pol))
2110        return PTR_ERR(pol);
2111    dst->vm_policy = pol;
2112    return 0;
2113}
2114
2115/*
2116 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2117 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2118 * with the mems_allowed returned by cpuset_mems_allowed(). This
2119 * keeps mempolicies cpuset relative after their cpuset moves. See
2120 * further kernel/cpuset.c update_nodemask().
2121 *
2122 * current's mempolicy may be rebound by another task (the task that changes
2123 * the cpuset's mems), so we needn't do the rebind work for the current task.
2124 */
2125
2126/* Slow path of a mempolicy duplicate */
2127struct mempolicy *__mpol_dup(struct mempolicy *old)
2128{
2129    struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2130
2131    if (!new)
2132        return ERR_PTR(-ENOMEM);
2133
2134    /* task's mempolicy is protected by alloc_lock */
2135    if (old == current->mempolicy) {
2136        task_lock(current);
2137        *new = *old;
2138        task_unlock(current);
2139    } else
2140        *new = *old;
2141
2142    if (current_cpuset_is_being_rebound()) {
2143        nodemask_t mems = cpuset_mems_allowed(current);
2144        if (new->flags & MPOL_F_REBINDING)
2145            mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2146        else
2147            mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2148    }
2149    atomic_set(&new->refcnt, 1);
2150    return new;
2151}
2152
2153/* Slow path of a mempolicy comparison */
2154bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2155{
2156    if (!a || !b)
2157        return false;
2158    if (a->mode != b->mode)
2159        return false;
2160    if (a->flags != b->flags)
2161        return false;
2162    if (mpol_store_user_nodemask(a))
2163        if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2164            return false;
2165
2166    switch (a->mode) {
2167    case MPOL_BIND:
2168        /* Fall through */
2169    case MPOL_INTERLEAVE:
2170        return !!nodes_equal(a->v.nodes, b->v.nodes);
2171    case MPOL_PREFERRED:
2172        return a->v.preferred_node == b->v.preferred_node;
2173    default:
2174        BUG();
2175        return false;
2176    }
2177}
2178
2179/*
2180 * Shared memory backing store policy support.
2181 *
2182 * Remember policies even when nobody has shared memory mapped.
2183 * The policies are kept in Red-Black tree linked from the inode.
2184 * They are protected by the sp->lock spinlock, which should be held
2185 * for any accesses to the tree.
2186 */
2187
2188/* lookup first element intersecting start-end */
2189/* Caller holds sp->lock */
2190static struct sp_node *
2191sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2192{
2193    struct rb_node *n = sp->root.rb_node;
2194
2195    while (n) {
2196        struct sp_node *p = rb_entry(n, struct sp_node, nd);
2197
2198        if (start >= p->end)
2199            n = n->rb_right;
2200        else if (end <= p->start)
2201            n = n->rb_left;
2202        else
2203            break;
2204    }
2205    if (!n)
2206        return NULL;
2207    for (;;) {
2208        struct sp_node *w = NULL;
2209        struct rb_node *prev = rb_prev(n);
2210        if (!prev)
2211            break;
2212        w = rb_entry(prev, struct sp_node, nd);
2213        if (w->end <= start)
2214            break;
2215        n = prev;
2216    }
2217    return rb_entry(n, struct sp_node, nd);
2218}
2219
2220/* Insert a new shared policy into the list. */
2221/* Caller holds sp->lock */
2222static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2223{
2224    struct rb_node **p = &sp->root.rb_node;
2225    struct rb_node *parent = NULL;
2226    struct sp_node *nd;
2227
2228    while (*p) {
2229        parent = *p;
2230        nd = rb_entry(parent, struct sp_node, nd);
2231        if (new->start < nd->start)
2232            p = &(*p)->rb_left;
2233        else if (new->end > nd->end)
2234            p = &(*p)->rb_right;
2235        else
2236            BUG();
2237    }
2238    rb_link_node(&new->nd, parent, p);
2239    rb_insert_color(&new->nd, &sp->root);
2240    pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2241         new->policy ? new->policy->mode : 0);
2242}
2243
2244/* Find shared policy intersecting idx */
2245struct mempolicy *
2246mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2247{
2248    struct mempolicy *pol = NULL;
2249    struct sp_node *sn;
2250
2251    if (!sp->root.rb_node)
2252        return NULL;
2253    spin_lock(&sp->lock);
2254    sn = sp_lookup(sp, idx, idx+1);
2255    if (sn) {
2256        mpol_get(sn->policy);
2257        pol = sn->policy;
2258    }
2259    spin_unlock(&sp->lock);
2260    return pol;
2261}
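/*
 * Illustrative sketch only (hypothetical callback, loosely modelled on
 * tmpfs' ->get_policy() implementation): translate a faulting address
 * into a file index and look it up in the inode's shared policy tree.
 */
static struct mempolicy *example_shared_get_policy(struct shared_policy *sp,
                                                   struct vm_area_struct *vma,
                                                   unsigned long addr)
{
    pgoff_t index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

    return mpol_shared_policy_lookup(sp, index);
}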
2262
2263static void sp_free(struct sp_node *n)
2264{
2265    mpol_put(n->policy);
2266    kmem_cache_free(sn_cache, n);
2267}
2268
2269/**
2270 * mpol_misplaced - check whether current page node is valid in policy
2271 *
2272 * @page: page to be checked
2273 * @vma: vm area where page mapped
2274 * @addr: virtual address where page mapped
2275 *
2276 * Lookup current policy node id for vma,addr and "compare to" page's
2277 * node id.
2278 *
2279 * Returns:
2280 * -1 - not misplaced, page is in the right node
2281 * node - node id where the page should be
2282 *
2283 * Policy determination "mimics" alloc_page_vma().
2284 * Called from fault path where we know the vma and faulting address.
2285 */
2286int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2287{
2288    struct mempolicy *pol;
2289    struct zone *zone;
2290    int curnid = page_to_nid(page);
2291    unsigned long pgoff;
2292    int thiscpu = raw_smp_processor_id();
2293    int thisnid = cpu_to_node(thiscpu);
2294    int polnid = -1;
2295    int ret = -1;
2296
2297    BUG_ON(!vma);
2298
2299    pol = get_vma_policy(current, vma, addr);
2300    if (!(pol->flags & MPOL_F_MOF))
2301        goto out;
2302
2303    switch (pol->mode) {
2304    case MPOL_INTERLEAVE:
2305        BUG_ON(addr >= vma->vm_end);
2306        BUG_ON(addr < vma->vm_start);
2307
2308        pgoff = vma->vm_pgoff;
2309        pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2310        polnid = offset_il_node(pol, vma, pgoff);
2311        break;
2312
2313    case MPOL_PREFERRED:
2314        if (pol->flags & MPOL_F_LOCAL)
2315            polnid = numa_node_id();
2316        else
2317            polnid = pol->v.preferred_node;
2318        break;
2319
2320    case MPOL_BIND:
2321        /*
2322         * allows binding to multiple nodes.
2323         * use current page if in policy nodemask,
2324         * else select nearest allowed node, if any.
2325         * If no allowed nodes, use current [!misplaced].
2326         */
2327        if (node_isset(curnid, pol->v.nodes))
2328            goto out;
2329        (void)first_zones_zonelist(
2330                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2331                gfp_zone(GFP_HIGHUSER),
2332                &pol->v.nodes, &zone);
2333        polnid = zone->node;
2334        break;
2335
2336    default:
2337        BUG();
2338    }
2339
2340    /* Migrate the page towards the node whose CPU is referencing it */
2341    if (pol->flags & MPOL_F_MORON) {
2342        polnid = thisnid;
2343
2344        if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2345            goto out;
2346    }
2347
2348    if (curnid != polnid)
2349        ret = polnid;
2350out:
2351    mpol_cond_put(pol);
2352
2353    return ret;
2354}
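/*
 * Illustrative sketch only (hypothetical helper): the NUMA hinting fault
 * path uses the return value along these lines, where -1 means the page
 * already sits on an acceptable node.
 */
static int example_numa_hint_target(struct page *page,
                                    struct vm_area_struct *vma,
                                    unsigned long addr)
{
    int target_nid = mpol_misplaced(page, vma, addr);

    return (target_nid == -1) ? page_to_nid(page) : target_nid;
}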
2355
2356static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2357{
2358    pr_debug("deleting %lx-%lx\n", n->start, n->end);
2359    rb_erase(&n->nd, &sp->root);
2360    sp_free(n);
2361}
2362
2363static void sp_node_init(struct sp_node *node, unsigned long start,
2364            unsigned long end, struct mempolicy *pol)
2365{
2366    node->start = start;
2367    node->end = end;
2368    node->policy = pol;
2369}
2370
2371static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2372                struct mempolicy *pol)
2373{
2374    struct sp_node *n;
2375    struct mempolicy *newpol;
2376
2377    n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2378    if (!n)
2379        return NULL;
2380
2381    newpol = mpol_dup(pol);
2382    if (IS_ERR(newpol)) {
2383        kmem_cache_free(sn_cache, n);
2384        return NULL;
2385    }
2386    newpol->flags |= MPOL_F_SHARED;
2387    sp_node_init(n, start, end, newpol);
2388
2389    return n;
2390}
2391
2392/* Replace a policy range. */
2393static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2394                 unsigned long end, struct sp_node *new)
2395{
2396    struct sp_node *n;
2397    struct sp_node *n_new = NULL;
2398    struct mempolicy *mpol_new = NULL;
2399    int ret = 0;
2400
2401restart:
2402    spin_lock(&sp->lock);
2403    n = sp_lookup(sp, start, end);
2404    /* Take care of old policies in the same range. */
2405    while (n && n->start < end) {
2406        struct rb_node *next = rb_next(&n->nd);
2407        if (n->start >= start) {
2408            if (n->end <= end)
2409                sp_delete(sp, n);
2410            else
2411                n->start = end;
2412        } else {
2413            /* Old policy spanning whole new range. */
2414            if (n->end > end) {
2415                if (!n_new)
2416                    goto alloc_new;
2417
2418                *mpol_new = *n->policy;
2419                atomic_set(&mpol_new->refcnt, 1);
2420                sp_node_init(n_new, end, n->end, mpol_new);
2421                n->end = start;
2422                sp_insert(sp, n_new);
2423                n_new = NULL;
2424                mpol_new = NULL;
2425                break;
2426            } else
2427                n->end = start;
2428        }
2429        if (!next)
2430            break;
2431        n = rb_entry(next, struct sp_node, nd);
2432    }
2433    if (new)
2434        sp_insert(sp, new);
2435    spin_unlock(&sp->lock);
2436    ret = 0;
2437
2438err_out:
2439    if (mpol_new)
2440        mpol_put(mpol_new);
2441    if (n_new)
2442        kmem_cache_free(sn_cache, n_new);
2443
2444    return ret;
2445
2446alloc_new:
2447    spin_unlock(&sp->lock);
2448    ret = -ENOMEM;
2449    n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2450    if (!n_new)
2451        goto err_out;
2452    mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2453    if (!mpol_new)
2454        goto err_out;
2455    goto restart;
2456}
2457
2458/**
2459 * mpol_shared_policy_init - initialize shared policy for inode
2460 * @sp: pointer to inode shared policy
2461 * @mpol: struct mempolicy to install
2462 *
2463 * Install non-NULL @mpol in inode's shared policy rb-tree.
2464 * On entry, the current task has a reference on a non-NULL @mpol.
2465 * This must be released on exit.
2466 * This is called at get_inode() time, so we can use GFP_KERNEL.
2467 */
2468void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2469{
2470    int ret;
2471
2472    sp->root = RB_ROOT; /* empty tree == default mempolicy */
2473    spin_lock_init(&sp->lock);
2474
2475    if (mpol) {
2476        struct vm_area_struct pvma;
2477        struct mempolicy *new;
2478        NODEMASK_SCRATCH(scratch);
2479
2480        if (!scratch)
2481            goto put_mpol;
2482        /* contextualize the tmpfs mount point mempolicy */
2483        new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2484        if (IS_ERR(new))
2485            goto free_scratch; /* no valid nodemask intersection */
2486
2487        task_lock(current);
2488        ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2489        task_unlock(current);
2490        if (ret)
2491            goto put_new;
2492
2493        /* Create pseudo-vma that contains just the policy */
2494        memset(&pvma, 0, sizeof(struct vm_area_struct));
2495        pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2496        mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2497
2498put_new:
2499        mpol_put(new); /* drop initial ref */
2500free_scratch:
2501        NODEMASK_SCRATCH_FREE(scratch);
2502put_mpol:
2503        mpol_put(mpol); /* drop our incoming ref on sb mpol */
2504    }
2505}
2506
2507int mpol_set_shared_policy(struct shared_policy *info,
2508            struct vm_area_struct *vma, struct mempolicy *npol)
2509{
2510    int err;
2511    struct sp_node *new = NULL;
2512    unsigned long sz = vma_pages(vma);
2513
2514    pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2515         vma->vm_pgoff,
2516         sz, npol ? npol->mode : -1,
2517         npol ? npol->flags : -1,
2518         npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2519
2520    if (npol) {
2521        new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2522        if (!new)
2523            return -ENOMEM;
2524    }
2525    err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2526    if (err && new)
2527        sp_free(new);
2528    return err;
2529}
2530
2531/* Free a backing policy store on inode delete. */
2532void mpol_free_shared_policy(struct shared_policy *p)
2533{
2534    struct sp_node *n;
2535    struct rb_node *next;
2536
2537    if (!p->root.rb_node)
2538        return;
2539    spin_lock(&p->lock);
2540    next = rb_first(&p->root);
2541    while (next) {
2542        n = rb_entry(next, struct sp_node, nd);
2543        next = rb_next(&n->nd);
2544        sp_delete(p, n);
2545    }
2546    spin_unlock(&p->lock);
2547}
2548
2549#ifdef CONFIG_NUMA_BALANCING
2550static int __initdata numabalancing_override;
2551
2552static void __init check_numabalancing_enable(void)
2553{
2554    bool numabalancing_default = false;
2555
2556    if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2557        numabalancing_default = true;
2558
2559    /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2560    if (numabalancing_override)
2561        set_numabalancing_state(numabalancing_override == 1);
2562
2563    if (nr_node_ids > 1 && !numabalancing_override) {
2564        pr_info("%s automatic NUMA balancing. "
2565            "Configure with numa_balancing= or the "
2566            "kernel.numa_balancing sysctl\n",
2567            numabalancing_default ? "Enabling" : "Disabling");
2568        set_numabalancing_state(numabalancing_default);
2569    }
2570}
2571
2572static int __init setup_numabalancing(char *str)
2573{
2574    int ret = 0;
2575    if (!str)
2576        goto out;
2577
2578    if (!strcmp(str, "enable")) {
2579        numabalancing_override = 1;
2580        ret = 1;
2581    } else if (!strcmp(str, "disable")) {
2582        numabalancing_override = -1;
2583        ret = 1;
2584    }
2585out:
2586    if (!ret)
2587        pr_warn("Unable to parse numa_balancing=\n");
2588
2589    return ret;
2590}
2591__setup("numa_balancing=", setup_numabalancing);
2592#else
2593static inline void __init check_numabalancing_enable(void)
2594{
2595}
2596#endif /* CONFIG_NUMA_BALANCING */
2597
2598/* assumes fs == KERNEL_DS */
2599void __init numa_policy_init(void)
2600{
2601    nodemask_t interleave_nodes;
2602    unsigned long largest = 0;
2603    int nid, prefer = 0;
2604
2605    policy_cache = kmem_cache_create("numa_policy",
2606                     sizeof(struct mempolicy),
2607                     0, SLAB_PANIC, NULL);
2608
2609    sn_cache = kmem_cache_create("shared_policy_node",
2610                     sizeof(struct sp_node),
2611                     0, SLAB_PANIC, NULL);
2612
2613    for_each_node(nid) {
2614        preferred_node_policy[nid] = (struct mempolicy) {
2615            .refcnt = ATOMIC_INIT(1),
2616            .mode = MPOL_PREFERRED,
2617            .flags = MPOL_F_MOF | MPOL_F_MORON,
2618            .v = { .preferred_node = nid, },
2619        };
2620    }
2621
2622    /*
2623     * Set interleaving policy for system init. Interleaving is only
2624     * enabled across suitably sized nodes (default is >= 16MB), or
2625     * fall back to the largest node if they're all smaller.
2626     */
2627    nodes_clear(interleave_nodes);
2628    for_each_node_state(nid, N_MEMORY) {
2629        unsigned long total_pages = node_present_pages(nid);
2630
2631        /* Preserve the largest node */
2632        if (largest < total_pages) {
2633            largest = total_pages;
2634            prefer = nid;
2635        }
2636
2637        /* Interleave this node? */
2638        if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2639            node_set(nid, interleave_nodes);
2640    }
2641
2642    /* All too small, use the largest */
2643    if (unlikely(nodes_empty(interleave_nodes)))
2644        node_set(prefer, interleave_nodes);
2645
2646    if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2647        pr_err("%s: interleaving failed\n", __func__);
2648
2649    check_numabalancing_enable();
2650}
2651
2652/* Reset policy of current process to default */
2653void numa_default_policy(void)
2654{
2655    do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2656}
2657
2658/*
2659 * Parse and format mempolicy from/to strings
2660 */
2661
2662/*
2663 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2664 */
2665static const char * const policy_modes[] =
2666{
2667    [MPOL_DEFAULT] = "default",
2668    [MPOL_PREFERRED] = "prefer",
2669    [MPOL_BIND] = "bind",
2670    [MPOL_INTERLEAVE] = "interleave",
2671    [MPOL_LOCAL] = "local",
2672};
2673
2674
2675#ifdef CONFIG_TMPFS
2676/**
2677 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2678 * @str: string containing mempolicy to parse
2679 * @mpol: pointer to struct mempolicy pointer, returned on success.
2680 *
2681 * Format of input:
2682 * <mode>[=<flags>][:<nodelist>]
2683 *
2684 * On success, returns 0, else 1
2685 */
2686int mpol_parse_str(char *str, struct mempolicy **mpol)
2687{
2688    struct mempolicy *new = NULL;
2689    unsigned short mode;
2690    unsigned short mode_flags;
2691    nodemask_t nodes;
2692    char *nodelist = strchr(str, ':');
2693    char *flags = strchr(str, '=');
2694    int err = 1;
2695
2696    if (nodelist) {
2697        /* NUL-terminate mode or flags string */
2698        *nodelist++ = '\0';
2699        if (nodelist_parse(nodelist, nodes))
2700            goto out;
2701        if (!nodes_subset(nodes, node_states[N_MEMORY]))
2702            goto out;
2703    } else
2704        nodes_clear(nodes);
2705
2706    if (flags)
2707        *flags++ = '\0'; /* terminate mode string */
2708
2709    for (mode = 0; mode < MPOL_MAX; mode++) {
2710        if (!strcmp(str, policy_modes[mode])) {
2711            break;
2712        }
2713    }
2714    if (mode >= MPOL_MAX)
2715        goto out;
2716
2717    switch (mode) {
2718    case MPOL_PREFERRED:
2719        /*
2720         * Insist on a nodelist of one node only
2721         */
2722        if (nodelist) {
2723            char *rest = nodelist;
2724            while (isdigit(*rest))
2725                rest++;
2726            if (*rest)
2727                goto out;
2728        }
2729        break;
2730    case MPOL_INTERLEAVE:
2731        /*
2732         * Default to online nodes with memory if no nodelist
2733         */
2734        if (!nodelist)
2735            nodes = node_states[N_MEMORY];
2736        break;
2737    case MPOL_LOCAL:
2738        /*
2739         * Don't allow a nodelist; mpol_new() checks flags
2740         */
2741        if (nodelist)
2742            goto out;
2743        mode = MPOL_PREFERRED;
2744        break;
2745    case MPOL_DEFAULT:
2746        /*
2747         * Insist on an empty nodelist
2748         */
2749        if (!nodelist)
2750            err = 0;
2751        goto out;
2752    case MPOL_BIND:
2753        /*
2754         * Insist on a nodelist
2755         */
2756        if (!nodelist)
2757            goto out;
2758    }
2759
2760    mode_flags = 0;
2761    if (flags) {
2762        /*
2763         * Currently, we only support two mutually exclusive
2764         * mode flags.
2765         */
2766        if (!strcmp(flags, "static"))
2767            mode_flags |= MPOL_F_STATIC_NODES;
2768        else if (!strcmp(flags, "relative"))
2769            mode_flags |= MPOL_F_RELATIVE_NODES;
2770        else
2771            goto out;
2772    }
2773
2774    new = mpol_new(mode, mode_flags, &nodes);
2775    if (IS_ERR(new))
2776        goto out;
2777
2778    /*
2779     * Save nodes for mpol_to_str() to show the tmpfs mount options
2780     * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2781     */
2782    if (mode != MPOL_PREFERRED)
2783        new->v.nodes = nodes;
2784    else if (nodelist)
2785        new->v.preferred_node = first_node(nodes);
2786    else
2787        new->flags |= MPOL_F_LOCAL;
2788
2789    /*
2790     * Save nodes for contextualization: this will be used to "clone"
2791     * the mempolicy in a specific context [cpuset] at a later time.
2792     */
2793    new->w.user_nodemask = nodes;
2794
2795    err = 0;
2796
2797out:
2798    /* Restore string for error message */
2799    if (nodelist)
2800        *--nodelist = ':';
2801    if (flags)
2802        *--flags = '=';
2803    if (!err)
2804        *mpol = new;
2805    return err;
2806}
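/*
 * Illustrative sketch only (hypothetical caller): parsing a tmpfs-style
 * "mpol=" option. The buffer must be writable because mpol_parse_str()
 * temporarily NUL-terminates the mode and flags substrings, and the
 * nodelist must name nodes that actually have memory.
 */
static struct mempolicy *example_parse_mpol(void)
{
    char opt[] = "interleave:0-3";
    struct mempolicy *mpol;

    if (mpol_parse_str(opt, &mpol))
        return NULL;    /* parse error */
    return mpol;        /* caller eventually drops this ref with mpol_put() */
}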
2807#endif /* CONFIG_TMPFS */
2808
2809/**
2810 * mpol_to_str - format a mempolicy structure for printing
2811 * @buffer: to contain formatted mempolicy string
2812 * @maxlen: length of @buffer
2813 * @pol: pointer to mempolicy to be formatted
2814 *
2815 * Convert @pol into a string. If @buffer is too short, truncate the string.
2816 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2817 * longest flag, "relative", and to display at least a few node ids.
2818 */
2819void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2820{
2821    char *p = buffer;
2822    nodemask_t nodes = NODE_MASK_NONE;
2823    unsigned short mode = MPOL_DEFAULT;
2824    unsigned short flags = 0;
2825
2826    if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2827        mode = pol->mode;
2828        flags = pol->flags;
2829    }
2830
2831    switch (mode) {
2832    case MPOL_DEFAULT:
2833        break;
2834    case MPOL_PREFERRED:
2835        if (flags & MPOL_F_LOCAL)
2836            mode = MPOL_LOCAL;
2837        else
2838            node_set(pol->v.preferred_node, nodes);
2839        break;
2840    case MPOL_BIND:
2841    case MPOL_INTERLEAVE:
2842        nodes = pol->v.nodes;
2843        break;
2844    default:
2845        WARN_ON_ONCE(1);
2846        snprintf(p, maxlen, "unknown");
2847        return;
2848    }
2849
2850    p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2851
2852    if (flags & MPOL_MODE_FLAGS) {
2853        p += snprintf(p, buffer + maxlen - p, "=");
2854
2855        /*
2856         * Currently, the only defined flags are mutually exclusive
2857         */
2858        if (flags & MPOL_F_STATIC_NODES)
2859            p += snprintf(p, buffer + maxlen - p, "static");
2860        else if (flags & MPOL_F_RELATIVE_NODES)
2861            p += snprintf(p, buffer + maxlen - p, "relative");
2862    }
2863
2864    if (!nodes_empty(nodes)) {
2865        p += snprintf(p, buffer + maxlen - p, ":");
2866        p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2867    }
2868}
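/*
 * Illustrative sketch only (hypothetical helper, similar in spirit to the
 * numa_maps and tmpfs show_options users): format a policy into a small
 * stack buffer for seq_file output.
 */
static void example_show_policy(struct seq_file *m, struct mempolicy *pol)
{
    char buf[64];

    mpol_to_str(buf, sizeof(buf), pol);
    seq_printf(m, "mpol=%s", buf);
}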
2869
