/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
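
/*
 * Illustrative only, not part of this file: from userspace the policies
 * above are requested through the set_mempolicy()/mbind() syscalls
 * defined below (usually via libnuma's numaif.h). A hypothetical caller
 * asking for interleaving over nodes 0 and 1 might do:
 *
 *    unsigned long nodemask = 0x3;      (bits 0 and 1 => nodes 0 and 1)
 *    set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8);
 *
 * The mode and mask are validated and applied by do_set_mempolicy().
 */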

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>

#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <linux/random.h>

#include "internal.h"

/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */

static struct kmem_cache *policy_cache;
static struct kmem_cache *sn_cache;

/* Highest zone. A specific allocation for a zone below that is not
   policied. */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 */
static struct mempolicy default_policy = {
    .refcnt = ATOMIC_INIT(1), /* never free it */
    .mode = MPOL_PREFERRED,
    .flags = MPOL_F_LOCAL,
};

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

static struct mempolicy *get_task_policy(struct task_struct *p)
{
    struct mempolicy *pol = p->mempolicy;
    int node;

    if (!pol) {
        node = numa_node_id();
        if (node != NUMA_NO_NODE)
            pol = &preferred_node_policy[node];

        /* preferred_node_policy is not initialised early in boot */
        if (!pol->mode)
            pol = NULL;
    }

    return pol;
}

static const struct mempolicy_operations {
    int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
    /*
     * If the read-side task has no lock to protect task->mempolicy, the
     * write-side task will rebind task->mempolicy in two steps. The
     * first step adds all of the newly allowed nodes, and the second
     * step removes all of the now-disallowed nodes. This way we avoid a
     * window in which no node is allowed for an allocation.
     * If we have a lock to protect task->mempolicy on the read side, we
     * rebind directly.
     *
     * step:
     * MPOL_REBIND_ONCE  - do the rebind work at once
     * MPOL_REBIND_STEP1 - set all of the newly allowed nodes
     * MPOL_REBIND_STEP2 - clean all of the disallowed nodes
     */
    void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
            enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
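
/*
 * Worked example of the two-step rebind (illustrative): a task bound to
 * nodes {0,1} is moved by cpuset to nodes {2,3}. STEP1 widens the mask
 * to {0,1,2,3} so allocations always see at least one allowed node;
 * STEP2 then narrows it to the final {2,3}.
 */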

/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
    return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}

static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
    return pol->flags & MPOL_MODE_FLAGS;
}

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                   const nodemask_t *rel)
{
    nodemask_t tmp;
    nodes_fold(tmp, *orig, nodes_weight(*rel));
    nodes_onto(*ret, tmp, *rel);
}
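
/*
 * Example of a relative nodemask mapping (illustrative): with a relative
 * mask rel = {4,5,6,7} (weight 4), a user mask orig = {1,3} is first
 * folded into the low 4 bit positions (still {1,3}) and then mapped onto
 * the set bits of rel, yielding {5,7}.
 */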

static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
    if (nodes_empty(*nodes))
        return -EINVAL;
    pol->v.nodes = *nodes;
    return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
    if (!nodes)
        pol->flags |= MPOL_F_LOCAL; /* local allocation */
    else if (nodes_empty(*nodes))
        return -EINVAL; /* no allowed nodes */
    else
        pol->v.preferred_node = first_node(*nodes);
    return 0;
}

static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
    if (!is_valid_nodemask(nodes))
        return -EINVAL;
    pol->v.nodes = *nodes;
    return 0;
}

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy. mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags. But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
             const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
    int ret;

    /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
    if (pol == NULL)
        return 0;
    /* Check N_MEMORY */
    nodes_and(nsc->mask1,
          cpuset_current_mems_allowed, node_states[N_MEMORY]);

    VM_BUG_ON(!nodes);
    if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
        nodes = NULL; /* explicit local allocation */
    else {
        if (pol->flags & MPOL_F_RELATIVE_NODES)
            mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
        else
            nodes_and(nsc->mask2, *nodes, nsc->mask1);

        if (mpol_store_user_nodemask(pol))
            pol->w.user_nodemask = *nodes;
        else
            pol->w.cpuset_mems_allowed =
                        cpuset_current_mems_allowed;
    }

    if (nodes)
        ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
    else
        ret = mpol_ops[pol->mode].create(pol, NULL);
    return ret;
}

/*
 * This function just creates a new policy, does some checks and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                  nodemask_t *nodes)
{
    struct mempolicy *policy;

    pr_debug("setting mode %d flags %d nodes[0] %lx\n",
         mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

    if (mode == MPOL_DEFAULT) {
        if (nodes && !nodes_empty(*nodes))
            return ERR_PTR(-EINVAL);
        return NULL;
    }
    VM_BUG_ON(!nodes);

    /*
     * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
     * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
     * All other modes require a valid pointer to a non-empty nodemask.
     */
    if (mode == MPOL_PREFERRED) {
        if (nodes_empty(*nodes)) {
            if (((flags & MPOL_F_STATIC_NODES) ||
                 (flags & MPOL_F_RELATIVE_NODES)))
                return ERR_PTR(-EINVAL);
        }
    } else if (mode == MPOL_LOCAL) {
        if (!nodes_empty(*nodes))
            return ERR_PTR(-EINVAL);
        mode = MPOL_PREFERRED;
    } else if (nodes_empty(*nodes))
        return ERR_PTR(-EINVAL);
    policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
    if (!policy)
        return ERR_PTR(-ENOMEM);
    atomic_set(&policy->refcnt, 1);
    policy->mode = mode;
    policy->flags = flags;

    return policy;
}

/* Slow path of a mpol destructor. */
void __mpol_put(struct mempolicy *p)
{
    if (!atomic_dec_and_test(&p->refcnt))
        return;
    kmem_cache_free(policy_cache, p);
}

static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
                enum mpol_rebind_step step)
{
}

/*
 * step:
 * MPOL_REBIND_ONCE  - do the rebind work at once
 * MPOL_REBIND_STEP1 - set all of the newly allowed nodes
 * MPOL_REBIND_STEP2 - clean all of the disallowed nodes
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
                 enum mpol_rebind_step step)
{
    nodemask_t tmp;

    if (pol->flags & MPOL_F_STATIC_NODES)
        nodes_and(tmp, pol->w.user_nodemask, *nodes);
    else if (pol->flags & MPOL_F_RELATIVE_NODES)
        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
    else {
        /*
         * if step == MPOL_REBIND_STEP1, we use
         * ->w.cpuset_mems_allowed to cache the result
         */
        if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
            nodes_remap(tmp, pol->v.nodes,
                    pol->w.cpuset_mems_allowed, *nodes);
            pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
        } else if (step == MPOL_REBIND_STEP2) {
            tmp = pol->w.cpuset_mems_allowed;
            pol->w.cpuset_mems_allowed = *nodes;
        } else
            BUG();
    }

    if (nodes_empty(tmp))
        tmp = *nodes;

    if (step == MPOL_REBIND_STEP1)
        nodes_or(pol->v.nodes, pol->v.nodes, tmp);
    else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
        pol->v.nodes = tmp;
    else
        BUG();

    if (!node_isset(current->il_next, tmp)) {
        current->il_next = next_node(current->il_next, tmp);
        if (current->il_next >= MAX_NUMNODES)
            current->il_next = first_node(tmp);
        if (current->il_next >= MAX_NUMNODES)
            current->il_next = numa_node_id();
    }
}
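
/*
 * Illustrative behaviour of the flag variants above: with a stored user
 * mask of {1,3}, a cpuset change to new nodes {3,4} gives
 *   MPOL_F_STATIC_NODES:   {1,3} & {3,4}    -> {3}
 *   MPOL_F_RELATIVE_NODES: {1,3} onto {3,4} -> {4} (ordinal 1 of the new
 *                          mask; bit 3 wraps to ordinal 1 via nodes_fold)
 * while the default case remaps each old node to its positional
 * counterpart in the new mask.
 */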

static void mpol_rebind_preferred(struct mempolicy *pol,
                  const nodemask_t *nodes,
                  enum mpol_rebind_step step)
{
    nodemask_t tmp;

    if (pol->flags & MPOL_F_STATIC_NODES) {
        int node = first_node(pol->w.user_nodemask);

        if (node_isset(node, *nodes)) {
            pol->v.preferred_node = node;
            pol->flags &= ~MPOL_F_LOCAL;
        } else
            pol->flags |= MPOL_F_LOCAL;
    } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
        pol->v.preferred_node = first_node(tmp);
    } else if (!(pol->flags & MPOL_F_LOCAL)) {
        pol->v.preferred_node = node_remap(pol->v.preferred_node,
                           pol->w.cpuset_mems_allowed,
                           *nodes);
        pol->w.cpuset_mems_allowed = *nodes;
    }
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task will rebind task->mempolicy in two steps. The first
 * step adds all of the newly allowed nodes, and the second step removes
 * all of the now-disallowed nodes. This way we avoid a window in which
 * no node is allowed for an allocation.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly.
 *
 * step:
 * MPOL_REBIND_ONCE  - do the rebind work at once
 * MPOL_REBIND_STEP1 - set all of the newly allowed nodes
 * MPOL_REBIND_STEP2 - clean all of the disallowed nodes
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
                enum mpol_rebind_step step)
{
    if (!pol)
        return;
    if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
        nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
        return;

    if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
        return;

    if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
        BUG();

    if (step == MPOL_REBIND_STEP1)
        pol->flags |= MPOL_F_REBINDING;
    else if (step == MPOL_REBIND_STEP2)
        pol->flags &= ~MPOL_F_REBINDING;
    else if (step >= MPOL_REBIND_NSTEP)
        BUG();

    mpol_ops[pol->mode].rebind(pol, newmask, step);
}

/*
 * Wrapper for mpol_rebind_policy() that just requires a task
 * pointer, and updates the task's mempolicy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
            enum mpol_rebind_step step)
{
    mpol_rebind_policy(tsk->mempolicy, new, step);
}

/*
 * Rebind each vma in mm to new nodemask.
 *
 * Call holding a reference to mm. Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
    struct vm_area_struct *vma;

    down_write(&mm->mmap_sem);
    for (vma = mm->mmap; vma; vma = vma->vm_next)
        mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
    up_write(&mm->mmap_sem);
}

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
    [MPOL_DEFAULT] = {
        .rebind = mpol_rebind_default,
    },
    [MPOL_INTERLEAVE] = {
        .create = mpol_new_interleave,
        .rebind = mpol_rebind_nodemask,
    },
    [MPOL_PREFERRED] = {
        .create = mpol_new_preferred,
        .rebind = mpol_rebind_preferred,
    },
    [MPOL_BIND] = {
        .create = mpol_new_bind,
        .rebind = mpol_rebind_nodemask,
    },
};
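
/*
 * Note: MPOL_LOCAL has no entry above; mpol_new() translates it to
 * MPOL_PREFERRED with an empty nodemask (MPOL_F_LOCAL), so its slot in
 * mpol_ops[] is never dereferenced.
 */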

static void migrate_page_add(struct page *page, struct list_head *pagelist,
                unsigned long flags);

/* Scan through pages, checking if they satisfy the required conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pte_t *orig_pte;
    pte_t *pte;
    spinlock_t *ptl;

    orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
    do {
        struct page *page;
        int nid;

        if (!pte_present(*pte))
            continue;
        page = vm_normal_page(vma, addr, *pte);
        if (!page)
            continue;
        /*
         * vm_normal_page() filters out zero pages, but there might
         * still be PageReserved pages to skip, perhaps in a VDSO.
         */
        if (PageReserved(page))
            continue;
        nid = page_to_nid(page);
        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
            continue;

        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
            migrate_page_add(page, private, flags);
        else
            break;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    pte_unmap_unlock(orig_pte, ptl);
    return addr != end;
}

static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pmd_t *pmd;
    unsigned long next;

    pmd = pmd_offset(pud, addr);
    do {
        next = pmd_addr_end(addr, end);
        split_huge_page_pmd(vma, addr, pmd);
        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
            continue;
        if (check_pte_range(vma, pmd, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pmd++, addr = next, addr != end);
    return 0;
}

static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pud_t *pud;
    unsigned long next;

    pud = pud_offset(pgd, addr);
    do {
        next = pud_addr_end(addr, end);
        if (pud_none_or_clear_bad(pud))
            continue;
        if (check_pmd_range(vma, pud, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pud++, addr = next, addr != end);
    return 0;
}

static inline int check_pgd_range(struct vm_area_struct *vma,
        unsigned long addr, unsigned long end,
        const nodemask_t *nodes, unsigned long flags,
        void *private)
{
    pgd_t *pgd;
    unsigned long next;

    pgd = pgd_offset(vma->vm_mm, addr);
    do {
        next = pgd_addr_end(addr, end);
        if (pgd_none_or_clear_bad(pgd))
            continue;
        if (check_pud_range(vma, pgd, addr, next, nodes,
                    flags, private))
            return -EIO;
    } while (pgd++, addr = next, addr != end);
    return 0;
}

#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
/*
 * This is used to mark a range of virtual addresses as inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
            unsigned long addr, unsigned long end)
{
    int nr_updated;
    BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);

    nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
    if (nr_updated)
        count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

    return nr_updated;
}
#else
static unsigned long change_prot_numa(struct vm_area_struct *vma,
            unsigned long addr, unsigned long end)
{
    return 0;
}
#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */

/*
 * Check if all pages in a range are on a set of nodes.
 * If pagelist != NULL then isolate pages from the LRU and
 * put them on the pagelist.
 */
static struct vm_area_struct *
check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        const nodemask_t *nodes, unsigned long flags, void *private)
{
    int err;
    struct vm_area_struct *first, *vma, *prev;

    first = find_vma(mm, start);
    if (!first)
        return ERR_PTR(-EFAULT);
    prev = NULL;
    for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
        unsigned long endvma = vma->vm_end;

        if (endvma > end)
            endvma = end;
        if (vma->vm_start > start)
            start = vma->vm_start;

        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
            if (!vma->vm_next && vma->vm_end < end)
                return ERR_PTR(-EFAULT);
            if (prev && prev->vm_end < vma->vm_start)
                return ERR_PTR(-EFAULT);
        }

        if (is_vm_hugetlb_page(vma))
            goto next;

        if (flags & MPOL_MF_LAZY) {
            change_prot_numa(vma, start, endvma);
            goto next;
        }

        if ((flags & MPOL_MF_STRICT) ||
             ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
              vma_migratable(vma))) {

            err = check_pgd_range(vma, start, endvma, nodes,
                        flags, private);
            if (err) {
                first = ERR_PTR(err);
                break;
            }
        }
next:
        prev = vma;
    }
    return first;
}

/*
 * Apply policy to a single VMA.
 * This must be called with the mmap_sem held for writing.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                        struct mempolicy *pol)
{
    int err;
    struct mempolicy *old;
    struct mempolicy *new;

    pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
         vma->vm_start, vma->vm_end, vma->vm_pgoff,
         vma->vm_ops, vma->vm_file,
         vma->vm_ops ? vma->vm_ops->set_policy : NULL);

    new = mpol_dup(pol);
    if (IS_ERR(new))
        return PTR_ERR(new);

    if (vma->vm_ops && vma->vm_ops->set_policy) {
        err = vma->vm_ops->set_policy(vma, new);
        if (err)
            goto err_out;
    }

    old = vma->vm_policy;
    vma->vm_policy = new; /* protected by mmap_sem */
    mpol_put(old);

    return 0;
 err_out:
    mpol_put(new);
    return err;
}

/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
               unsigned long end, struct mempolicy *new_pol)
{
    struct vm_area_struct *next;
    struct vm_area_struct *prev;
    struct vm_area_struct *vma;
    int err = 0;
    pgoff_t pgoff;
    unsigned long vmstart;
    unsigned long vmend;

    vma = find_vma(mm, start);
    if (!vma || vma->vm_start > start)
        return -EFAULT;

    prev = vma->vm_prev;
    if (start > vma->vm_start)
        prev = vma;

    for (; vma && vma->vm_start < end; prev = vma, vma = next) {
        next = vma->vm_next;
        vmstart = max(start, vma->vm_start);
        vmend = min(end, vma->vm_end);

        if (mpol_equal(vma_policy(vma), new_pol))
            continue;

        pgoff = vma->vm_pgoff +
            ((vmstart - vma->vm_start) >> PAGE_SHIFT);
        prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                  vma->anon_vma, vma->vm_file, pgoff,
                  new_pol);
        if (prev) {
            vma = prev;
            next = vma->vm_next;
            continue;
        }
        if (vma->vm_start != vmstart) {
            err = split_vma(vma->vm_mm, vma, vmstart, 1);
            if (err)
                goto out;
        }
        if (vma->vm_end != vmend) {
            err = split_vma(vma->vm_mm, vma, vmend, 0);
            if (err)
                goto out;
        }
        err = vma_replace_policy(vma, new_pol);
        if (err)
            goto out;
    }

 out:
    return err;
}
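
/*
 * Split behaviour, illustrated: mbind() on [s, e) that lands in the
 * middle of a VMA [A, B), with A < s < e < B and a different policy,
 * first tries vma_merge() with adjacent VMAs; failing that it splits at
 * s and then at e, so only the middle piece [s, e) gets new_pol:
 *
 *   before:  [A ------------------ B)
 *   after:   [A -- s)[s -- e)[e -- B)
 *                     ^ new_pol
 */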

/*
 * Update task->flags PF_MEMPOLICY bit: set iff non-default
 * mempolicy. Allows more rapid checking of this (combined perhaps
 * with other PF_* flag bits) on memory allocation hot code paths.
 *
 * If called from outside this file, the task 'p' should -only- be
 * a newly forked child not yet visible on the task list, because
 * manipulating the task flags of a visible task is not safe.
 *
 * The above limitation is why this routine has the funny name
 * mpol_fix_fork_child_flag().
 *
 * It is also safe to call this with a task pointer of current,
 * which the static wrapper mpol_set_task_struct_flag() does,
 * for use within this file.
 */
void mpol_fix_fork_child_flag(struct task_struct *p)
{
    if (p->mempolicy)
        p->flags |= PF_MEMPOLICY;
    else
        p->flags &= ~PF_MEMPOLICY;
}

static void mpol_set_task_struct_flag(void)
{
    mpol_fix_fork_child_flag(current);
}

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                 nodemask_t *nodes)
{
    struct mempolicy *new, *old;
    struct mm_struct *mm = current->mm;
    NODEMASK_SCRATCH(scratch);
    int ret;

    if (!scratch)
        return -ENOMEM;

    new = mpol_new(mode, flags, nodes);
    if (IS_ERR(new)) {
        ret = PTR_ERR(new);
        goto out;
    }
    /*
     * prevent changing our mempolicy while show_numa_maps()
     * is using it.
     * Note: do_set_mempolicy() can be called at init time
     * with no 'mm'.
     */
    if (mm)
        down_write(&mm->mmap_sem);
    task_lock(current);
    ret = mpol_set_nodemask(new, nodes, scratch);
    if (ret) {
        task_unlock(current);
        if (mm)
            up_write(&mm->mmap_sem);
        mpol_put(new);
        goto out;
    }
    old = current->mempolicy;
    current->mempolicy = new;
    mpol_set_task_struct_flag();
    if (new && new->mode == MPOL_INTERLEAVE &&
        nodes_weight(new->v.nodes))
        current->il_next = first_node(new->v.nodes);
    task_unlock(current);
    if (mm)
        up_write(&mm->mmap_sem);

    mpol_put(old);
    ret = 0;
out:
    NODEMASK_SCRATCH_FREE(scratch);
    return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
    nodes_clear(*nodes);
    if (p == &default_policy)
        return;

    switch (p->mode) {
    case MPOL_BIND:
        /* Fall through */
    case MPOL_INTERLEAVE:
        *nodes = p->v.nodes;
        break;
    case MPOL_PREFERRED:
        if (!(p->flags & MPOL_F_LOCAL))
            node_set(p->v.preferred_node, *nodes);
        /* else return empty node mask for local allocation */
        break;
    default:
        BUG();
    }
}

static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
    struct page *p;
    int err;

    err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
    if (err >= 0) {
        err = page_to_nid(p);
        put_page(p);
    }
    return err;
}

/* Retrieve NUMA policy */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                 unsigned long addr, unsigned long flags)
{
    int err;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = NULL;
    struct mempolicy *pol = current->mempolicy;

    if (flags &
        ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
        return -EINVAL;

    if (flags & MPOL_F_MEMS_ALLOWED) {
        if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
            return -EINVAL;
        *policy = 0; /* just so it's initialized */
        task_lock(current);
        *nmask = cpuset_current_mems_allowed;
        task_unlock(current);
        return 0;
    }

    if (flags & MPOL_F_ADDR) {
        /*
         * Do NOT fall back to task policy if the
         * vma/shared policy at addr is NULL. We
         * want to return MPOL_DEFAULT in this case.
         */
        down_read(&mm->mmap_sem);
        vma = find_vma_intersection(mm, addr, addr+1);
        if (!vma) {
            up_read(&mm->mmap_sem);
            return -EFAULT;
        }
        if (vma->vm_ops && vma->vm_ops->get_policy)
            pol = vma->vm_ops->get_policy(vma, addr);
        else
            pol = vma->vm_policy;
    } else if (addr)
        return -EINVAL;

    if (!pol)
        pol = &default_policy; /* indicates default behavior */

    if (flags & MPOL_F_NODE) {
        if (flags & MPOL_F_ADDR) {
            err = lookup_node(mm, addr);
            if (err < 0)
                goto out;
            *policy = err;
        } else if (pol == current->mempolicy &&
                pol->mode == MPOL_INTERLEAVE) {
            *policy = current->il_next;
        } else {
            err = -EINVAL;
            goto out;
        }
    } else {
        *policy = pol == &default_policy ? MPOL_DEFAULT :
                        pol->mode;
        /*
         * Internal mempolicy flags must be masked off before exposing
         * the policy to userspace.
         */
        *policy |= (pol->flags & MPOL_MODE_FLAGS);
    }

    if (vma) {
        up_read(&current->mm->mmap_sem);
        vma = NULL;
    }

    err = 0;
    if (nmask) {
        if (mpol_store_user_nodemask(pol)) {
            *nmask = pol->w.user_nodemask;
        } else {
            task_lock(current);
            get_policy_nodemask(pol, nmask);
            task_unlock(current);
        }
    }

 out:
    mpol_cond_put(pol);
    if (vma)
        up_read(&current->mm->mmap_sem);
    return err;
}

#ifdef CONFIG_MIGRATION
/*
 * page migration
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                unsigned long flags)
{
    /*
     * Avoid migrating a page that is shared with others.
     */
    if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
        if (!isolate_lru_page(page)) {
            list_add_tail(&page->lru, pagelist);
            inc_zone_page_state(page, NR_ISOLATED_ANON +
                        page_is_file_cache(page));
        }
    }
}

static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
    return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
               int flags)
{
    nodemask_t nmask;
    LIST_HEAD(pagelist);
    int err = 0;

    nodes_clear(nmask);
    node_set(source, nmask);

    /*
     * This does not "check" the range but isolates all pages that
     * need migration. Between passing in the full user address
     * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
     */
    VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
    check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
            flags | MPOL_MF_DISCONTIG_OK, &pagelist);

    if (!list_empty(&pagelist)) {
        err = migrate_pages(&pagelist, new_node_page, dest,
                    MIGRATE_SYNC, MR_SYSCALL);
        if (err)
            putback_lru_pages(&pagelist);
    }

    return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * Returns the number of pages that could not be moved.
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
             const nodemask_t *to, int flags)
{
    int busy = 0;
    int err;
    nodemask_t tmp;

    err = migrate_prep();
    if (err)
        return err;

    down_read(&mm->mmap_sem);

    err = migrate_vmas(mm, from, to, flags);
    if (err)
        goto out;

    /*
     * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
     * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
     * bit in 'tmp', and return that <source, dest> pair for migration.
     * The pair of nodemasks 'to' and 'from' define the map.
     *
     * If no pair of bits is found that way, fallback to picking some
     * pair of 'source' and 'dest' bits that are not the same. If the
     * 'source' and 'dest' bits are the same, this represents a node
     * that will be migrating to itself, so no pages need move.
     *
     * If no bits are left in 'tmp', or if all remaining bits left
     * in 'tmp' correspond to the same bit in 'to', return false
     * (nothing left to migrate).
     *
     * This lets us pick a pair of nodes to migrate between, such that
     * if possible the dest node is not already occupied by some other
     * source node, minimizing the risk of overloading the memory on a
     * node that would happen if we migrated incoming memory to a node
     * before migrating outgoing memory from that same node.
     *
     * A single scan of tmp is sufficient. As we go, we remember the
     * most recent <s, d> pair that moved (s != d). If we find a pair
     * that not only moved, but what's better, moved to an empty slot
     * (d is not set in tmp), then we break out then, with that pair.
     * Otherwise when we finish scanning from_tmp, we at least have the
     * most recent <s, d> pair that moved. If we get all the way through
     * the scan of tmp without finding any node that moved, much less
     * moved to an empty node, then there is nothing left worth migrating.
     */

    tmp = *from;
    while (!nodes_empty(tmp)) {
        int s, d;
        int source = -1;
        int dest = 0;

        for_each_node_mask(s, tmp) {

            /*
             * do_migrate_pages() tries to maintain the relative
             * node relationship of the pages established between
             * threads and memory areas.
             *
             * However, if the number of source nodes is not equal
             * to the number of destination nodes, we cannot
             * preserve this node relative relationship. In that
             * case, skip copying memory from a node that is in the
             * destination mask.
             *
             * Example: [2,3,4] -> [3,4,5] moves everything.
             * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
             */

            if ((nodes_weight(*from) != nodes_weight(*to)) &&
                        (node_isset(s, *to)))
                continue;

            d = node_remap(s, *from, *to);
            if (s == d)
                continue;

            source = s; /* Node moved. Memorize */
            dest = d;

            /* dest not in remaining from nodes? */
            if (!node_isset(dest, tmp))
                break;
        }
        if (source == -1)
            break;

        node_clear(source, tmp);
        err = migrate_to_node(mm, source, dest, flags);
        if (err > 0)
            busy += err;
        if (err < 0)
            break;
    }
out:
    up_read(&mm->mmap_sem);
    if (err < 0)
        return err;
    return busy;
}

/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the vma pointed to by @private.
 * Search forward from there, if not. N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 */
static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
    struct vm_area_struct *vma = (struct vm_area_struct *)private;
    unsigned long uninitialized_var(address);

    while (vma) {
        address = page_address_in_vma(page, vma);
        if (address != -EFAULT)
            break;
        vma = vma->vm_next;
    }

    /*
     * if !vma, alloc_page_vma() will use task or system default policy
     */
    return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else

static void migrate_page_add(struct page *page, struct list_head *pagelist,
                unsigned long flags)
{
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
             const nodemask_t *to, int flags)
{
    return -ENOSYS;
}

static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
{
    return NULL;
}
#endif

static long do_mbind(unsigned long start, unsigned long len,
             unsigned short mode, unsigned short mode_flags,
             nodemask_t *nmask, unsigned long flags)
{
    struct vm_area_struct *vma;
    struct mm_struct *mm = current->mm;
    struct mempolicy *new;
    unsigned long end;
    int err;
    LIST_HEAD(pagelist);

    if (flags & ~(unsigned long)MPOL_MF_VALID)
        return -EINVAL;
    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
        return -EPERM;

    if (start & ~PAGE_MASK)
        return -EINVAL;

    if (mode == MPOL_DEFAULT)
        flags &= ~MPOL_MF_STRICT;

    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
    end = start + len;

    if (end < start)
        return -EINVAL;
    if (end == start)
        return 0;

    new = mpol_new(mode, mode_flags, nmask);
    if (IS_ERR(new))
        return PTR_ERR(new);

    /* new == NULL for MPOL_DEFAULT; guard before setting MPOL_F_MOF */
    if (new && (flags & MPOL_MF_LAZY))
        new->flags |= MPOL_F_MOF;

    /*
     * If we are using the default policy then operation
     * on discontinuous address spaces is okay after all
     */
    if (!new)
        flags |= MPOL_MF_DISCONTIG_OK;

    pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
         start, start + len, mode, mode_flags,
         nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {

        err = migrate_prep();
        if (err)
            goto mpol_out;
    }
    {
        NODEMASK_SCRATCH(scratch);
        if (scratch) {
            down_write(&mm->mmap_sem);
            task_lock(current);
            err = mpol_set_nodemask(new, nmask, scratch);
            task_unlock(current);
            if (err)
                up_write(&mm->mmap_sem);
        } else
            err = -ENOMEM;
        NODEMASK_SCRATCH_FREE(scratch);
    }
    if (err)
        goto mpol_out;

    vma = check_range(mm, start, end, nmask,
              flags | MPOL_MF_INVERT, &pagelist);

    err = PTR_ERR(vma); /* maybe ... */
    if (!IS_ERR(vma))
        err = mbind_range(mm, start, end, new);

    if (!err) {
        int nr_failed = 0;

        if (!list_empty(&pagelist)) {
            WARN_ON_ONCE(flags & MPOL_MF_LAZY);
            nr_failed = migrate_pages(&pagelist, new_vma_page,
                    (unsigned long)vma,
                    MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
            if (nr_failed)
                putback_lru_pages(&pagelist);
        }

        if (nr_failed && (flags & MPOL_MF_STRICT))
            err = -EIO;
    } else
        putback_lru_pages(&pagelist);

    up_write(&mm->mmap_sem);
 mpol_out:
    mpol_put(new);
    return err;
}

/*
 * User space interface with variable sized bitmaps for nodelists.
 */

/* Copy a node mask from user space. */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
             unsigned long maxnode)
{
    unsigned long k;
    unsigned long nlongs;
    unsigned long endmask;

    --maxnode;
    nodes_clear(*nodes);
    if (maxnode == 0 || !nmask)
        return 0;
    if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
        return -EINVAL;

    nlongs = BITS_TO_LONGS(maxnode);
    if ((maxnode % BITS_PER_LONG) == 0)
        endmask = ~0UL;
    else
        endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

    /* If the user specified more nodes than supported, just check
       that the unsupported part is all zero. */
    if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
        if (nlongs > PAGE_SIZE/sizeof(long))
            return -EINVAL;
        for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
            unsigned long t;
            if (get_user(t, nmask + k))
                return -EFAULT;
            if (k == nlongs - 1) {
                if (t & endmask)
                    return -EINVAL;
            } else if (t)
                return -EINVAL;
        }
        nlongs = BITS_TO_LONGS(MAX_NUMNODES);
        endmask = ~0UL;
    }

    if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
        return -EFAULT;
    nodes_addr(*nodes)[nlongs-1] &= endmask;
    return 0;
}
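
/*
 * get_nodes() arithmetic, illustrated on a 64-bit machine: a userspace
 * maxnode of 65 becomes 64 after the decrement, so nlongs = 1 and
 * endmask = ~0UL (all 64 bits significant). A maxnode of 64 becomes 63,
 * so endmask = (1UL << 63) - 1 and bit 63 of the copied word is cleared.
 */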

/* Copy a kernel node mask to user space */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                  nodemask_t *nodes)
{
    unsigned long copy = ALIGN(maxnode-1, 64) / 8;
    const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);

    if (copy > nbytes) {
        if (copy > PAGE_SIZE)
            return -EINVAL;
        if (clear_user((char __user *)mask + nbytes, copy - nbytes))
            return -EFAULT;
        copy = nbytes;
    }
    return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
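
/*
 * For example, on a 64-bit machine with MAX_NUMNODES = 1024 (so
 * nbytes = 128), a caller passing maxnode = 2048 gets
 * copy = ALIGN(2047, 64) / 8 = 256 bytes: the upper 128 bytes of the
 * user buffer are zeroed and only the first 128 are copied from the
 * kernel mask.
 */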

SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
        unsigned long, mode, unsigned long __user *, nmask,
        unsigned long, maxnode, unsigned, flags)
{
    nodemask_t nodes;
    int err;
    unsigned short mode_flags;

    mode_flags = mode & MPOL_MODE_FLAGS;
    mode &= ~MPOL_MODE_FLAGS;
    if (mode >= MPOL_MAX)
        return -EINVAL;
    if ((mode_flags & MPOL_F_STATIC_NODES) &&
        (mode_flags & MPOL_F_RELATIVE_NODES))
        return -EINVAL;
    err = get_nodes(&nodes, nmask, maxnode);
    if (err)
        return err;
    return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}

/* Set the process memory policy */
SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
        unsigned long, maxnode)
{
    int err;
    nodemask_t nodes;
    unsigned short flags;

    flags = mode & MPOL_MODE_FLAGS;
    mode &= ~MPOL_MODE_FLAGS;
    if ((unsigned int)mode >= MPOL_MAX)
        return -EINVAL;
    if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
        return -EINVAL;
    err = get_nodes(&nodes, nmask, maxnode);
    if (err)
        return err;
    return do_set_mempolicy(mode, flags, &nodes);
}

SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
        const unsigned long __user *, old_nodes,
        const unsigned long __user *, new_nodes)
{
    const struct cred *cred = current_cred(), *tcred;
    struct mm_struct *mm = NULL;
    struct task_struct *task;
    nodemask_t task_nodes;
    int err;
    nodemask_t *old;
    nodemask_t *new;
    NODEMASK_SCRATCH(scratch);

    if (!scratch)
        return -ENOMEM;

    old = &scratch->mask1;
    new = &scratch->mask2;

    err = get_nodes(old, old_nodes, maxnode);
    if (err)
        goto out;

    err = get_nodes(new, new_nodes, maxnode);
    if (err)
        goto out;

    /* Find the mm_struct */
    rcu_read_lock();
    task = pid ? find_task_by_vpid(pid) : current;
    if (!task) {
        rcu_read_unlock();
        err = -ESRCH;
        goto out;
    }
    get_task_struct(task);

    err = -EINVAL;

    /*
     * Check if this process has the right to modify the specified
     * process. The right exists if the process has administrative
     * capabilities, superuser privileges or the same
     * userid as the target process.
     */
    tcred = __task_cred(task);
    if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
        !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
        !capable(CAP_SYS_NICE)) {
        rcu_read_unlock();
        err = -EPERM;
        goto out_put;
    }
    rcu_read_unlock();

    task_nodes = cpuset_mems_allowed(task);
    /* Is the user allowed to access the target nodes? */
    if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
        err = -EPERM;
        goto out_put;
    }

    if (!nodes_subset(*new, node_states[N_MEMORY])) {
        err = -EINVAL;
        goto out_put;
    }

    err = security_task_movememory(task);
    if (err)
        goto out_put;

    mm = get_task_mm(task);
    put_task_struct(task);

    if (!mm) {
        err = -EINVAL;
        goto out;
    }

    err = do_migrate_pages(mm, old, new,
        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

    mmput(mm);
out:
    NODEMASK_SCRATCH_FREE(scratch);

    return err;

out_put:
    put_task_struct(task);
    goto out;
}

/* Retrieve NUMA policy */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
        unsigned long __user *, nmask, unsigned long, maxnode,
        unsigned long, addr, unsigned long, flags)
{
    int err;
    int uninitialized_var(pval);
    nodemask_t nodes;

    if (nmask != NULL && maxnode < MAX_NUMNODES)
        return -EINVAL;

    err = do_get_mempolicy(&pval, &nodes, addr, flags);

    if (err)
        return err;

    if (policy && put_user(pval, policy))
        return -EFAULT;

    if (nmask)
        err = copy_nodes_to_user(nmask, maxnode, &nodes);

    return err;
}

#ifdef CONFIG_COMPAT

asmlinkage long compat_sys_get_mempolicy(int __user *policy,
                     compat_ulong_t __user *nmask,
                     compat_ulong_t maxnode,
                     compat_ulong_t addr, compat_ulong_t flags)
{
    long err;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    DECLARE_BITMAP(bm, MAX_NUMNODES);

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask)
        nm = compat_alloc_user_space(alloc_size);

    err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

    if (!err && nmask) {
        unsigned long copy_size;
        copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
        err = copy_from_user(bm, nm, copy_size);
        /* ensure entire bitmap is zeroed */
        err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
        err |= compat_put_bitmap(nmask, bm, nr_bits);
    }

    return err;
}

asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
                     compat_ulong_t maxnode)
{
    long err = 0;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    DECLARE_BITMAP(bm, MAX_NUMNODES);

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask) {
        err = compat_get_bitmap(bm, nmask, nr_bits);
        nm = compat_alloc_user_space(alloc_size);
        err |= copy_to_user(nm, bm, alloc_size);
    }

    if (err)
        return -EFAULT;

    return sys_set_mempolicy(mode, nm, nr_bits+1);
}

asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
                 compat_ulong_t mode, compat_ulong_t __user *nmask,
                 compat_ulong_t maxnode, compat_ulong_t flags)
{
    long err = 0;
    unsigned long __user *nm = NULL;
    unsigned long nr_bits, alloc_size;
    nodemask_t bm;

    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

    if (nmask) {
        err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
        nm = compat_alloc_user_space(alloc_size);
        err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
    }

    if (err)
        return -EFAULT;

    return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
}

#endif

/*
 * get_vma_policy(@task, @vma, @addr)
 * @task - task for fallback if vma policy == default
 * @vma  - virtual memory area whose policy is sought
 * @addr - address in @vma for shared policy lookup
 *
 * Returns effective policy for a VMA at specified address.
 * Falls back to @task or system default policy, as necessary.
 * Current or other task's task mempolicy and non-shared vma policies must be
 * protected by task_lock(task) by the caller.
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task. It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct task_struct *task,
        struct vm_area_struct *vma, unsigned long addr)
{
    struct mempolicy *pol = get_task_policy(task);

    if (vma) {
        if (vma->vm_ops && vma->vm_ops->get_policy) {
            struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
                                    addr);
            if (vpol)
                pol = vpol;
        } else if (vma->vm_policy) {
            pol = vma->vm_policy;

            /*
             * shmem_alloc_page() passes MPOL_F_SHARED policy with
             * a pseudo vma whose vma->vm_ops == NULL. Take a
             * reference count on these policies which will be
             * dropped by mpol_cond_put() later
             */
            if (mpol_needs_cond_ref(pol))
                mpol_get(pol);
        }
    }
    if (!pol)
        pol = &default_policy;
    return pol;
}

static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
    enum zone_type dynamic_policy_zone = policy_zone;

    BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

    /*
     * if policy->v.nodes has movable memory only,
     * we apply policy when gfp_zone(gfp) == ZONE_MOVABLE only.
     *
     * policy->v.nodes has already been intersected with
     * node_states[N_MEMORY], so if the following test fails, it
     * implies policy->v.nodes has movable memory only.
     */
    if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
        dynamic_policy_zone = ZONE_MOVABLE;

    return zone >= dynamic_policy_zone;
}

/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
    /* Lower zones don't get a nodemask applied for MPOL_BIND */
    if (unlikely(policy->mode == MPOL_BIND) &&
            apply_policy_zone(policy, gfp_zone(gfp)) &&
            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
        return &policy->v.nodes;

    return NULL;
}

/* Return a zonelist indicated by gfp for node representing a mempolicy */
static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
    int nd)
{
    switch (policy->mode) {
    case MPOL_PREFERRED:
        if (!(policy->flags & MPOL_F_LOCAL))
            nd = policy->v.preferred_node;
        break;
    case MPOL_BIND:
        /*
         * Normally, MPOL_BIND allocations are node-local within the
         * allowed nodemask. However, if __GFP_THISNODE is set and the
         * current node isn't part of the mask, we use the zonelist for
         * the first node in the mask instead.
         */
        if (unlikely(gfp & __GFP_THISNODE) &&
                unlikely(!node_isset(nd, policy->v.nodes)))
            nd = first_node(policy->v.nodes);
        break;
    default:
        BUG();
    }
    return node_zonelist(nd, gfp);
}

/* Do dynamic interleaving for a process */
static unsigned interleave_nodes(struct mempolicy *policy)
{
    unsigned nid, next;
    struct task_struct *me = current;

    nid = me->il_next;
    next = next_node(nid, policy->v.nodes);
    if (next >= MAX_NUMNODES)
        next = first_node(policy->v.nodes);
    if (next < MAX_NUMNODES)
        me->il_next = next;
    return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 * @policy must be protected from freeing by the caller. If @policy is
 * the current task's mempolicy, this protection is implicit, as only the
 * task itself can change its policy. The system default policy requires no
 * such protection.
 */
unsigned slab_node(void)
{
    struct mempolicy *policy;

    if (in_interrupt())
        return numa_node_id();

    policy = current->mempolicy;
    if (!policy || policy->flags & MPOL_F_LOCAL)
        return numa_node_id();

    switch (policy->mode) {
    case MPOL_PREFERRED:
        /*
         * handled MPOL_F_LOCAL above
         */
        return policy->v.preferred_node;

    case MPOL_INTERLEAVE:
        return interleave_nodes(policy);

    case MPOL_BIND: {
        /*
         * Follow bind policy behavior and start allocation at the
         * first node.
         */
        struct zonelist *zonelist;
        struct zone *zone;
        enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
        zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
        (void)first_zones_zonelist(zonelist, highest_zoneidx,
                            &policy->v.nodes,
                            &zone);
        return zone ? zone->node : numa_node_id();
    }

    default:
        BUG();
    }
}
1756
1757/* Do static interleaving for a VMA with known offset. */
1758static unsigned offset_il_node(struct mempolicy *pol,
1759        struct vm_area_struct *vma, unsigned long off)
1760{
1761    unsigned nnodes = nodes_weight(pol->v.nodes);
1762    unsigned target;
1763    int c;
1764    int nid = -1;
1765
1766    if (!nnodes)
1767        return numa_node_id();
1768    target = (unsigned int)off % nnodes;
1769    c = 0;
1770    do {
1771        nid = next_node(nid, pol->v.nodes);
1772        c++;
1773    } while (c <= target);
1774    return nid;
1775}
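
/*
 * Worked example (illustrative, not from the source): with pol->v.nodes =
 * {0,2,5} and off = 7, nnodes = 3 and target = 7 % 3 = 1, so the loop
 * above stops at the second set node and returns nid 2. The offset->node
 * mapping is stable for a given nodemask, which is what makes VMA-based
 * interleaving "static".
 */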
1776
1777/* Determine a node number for interleave */
1778static inline unsigned interleave_nid(struct mempolicy *pol,
1779         struct vm_area_struct *vma, unsigned long addr, int shift)
1780{
1781    if (vma) {
1782        unsigned long off;
1783
1784        /*
1785         * for small pages, there is no difference between
1786         * shift and PAGE_SHIFT, so the bit-shift is safe.
1787         * for huge pages, since vm_pgoff is in units of small
1788         * pages, we need to shift off the always 0 bits to get
1789         * a useful offset.
1790         */
1791        BUG_ON(shift < PAGE_SHIFT);
1792        off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1793        off += (addr - vma->vm_start) >> shift;
1794        return offset_il_node(pol, vma, off);
1795    } else
1796        return interleave_nodes(pol);
1797}
1798
1799/*
1800 * Return the bit number of a random bit set in the nodemask.
1801 * (returns -1 if nodemask is empty)
1802 */
1803int node_random(const nodemask_t *maskp)
1804{
1805    int w, bit = -1;
1806
1807    w = nodes_weight(*maskp);
1808    if (w)
1809        bit = bitmap_ord_to_pos(maskp->bits,
1810            get_random_int() % w, MAX_NUMNODES);
1811    return bit;
1812}
1813
1814#ifdef CONFIG_HUGETLBFS
1815/*
1816 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1817 * @vma = virtual memory area whose policy is sought
1818 * @addr = address in @vma for shared policy lookup and interleave policy
1819 * @gfp_flags = for requested zone
1820 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1821 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1822 *
1823 * Returns a zonelist suitable for a huge page allocation and a pointer
1824 * to the struct mempolicy for conditional unref after allocation.
1825 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1826 * @nodemask for filtering the zonelist.
1827 *
1828 * Must be protected by get_mems_allowed()
1829 */
1830struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1831                gfp_t gfp_flags, struct mempolicy **mpol,
1832                nodemask_t **nodemask)
1833{
1834    struct zonelist *zl;
1835
1836    *mpol = get_vma_policy(current, vma, addr);
1837    *nodemask = NULL; /* assume !MPOL_BIND */
1838
1839    if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1840        zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1841                huge_page_shift(hstate_vma(vma))), gfp_flags);
1842    } else {
1843        zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1844        if ((*mpol)->mode == MPOL_BIND)
1845            *nodemask = &(*mpol)->v.nodes;
1846    }
1847    return zl;
1848}
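
/*
 * Usage sketch (hypothetical caller, modeled loosely on the hugetlb fault
 * path): the returned zonelist drives the huge page lookup, @nodemask
 * filters it for MPOL_BIND, and the policy reference is dropped afterwards.
 */
static inline void example_huge_alloc(struct vm_area_struct *vma,
				unsigned long addr, gfp_t gfp_mask)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist;

	zonelist = huge_zonelist(vma, addr, gfp_mask, &mpol, &nodemask);
	(void)zonelist;		/* a real caller walks this zonelist here */
	mpol_cond_put(mpol);	/* conditional unref, see comment above */
}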
1849
1850/*
1851 * init_nodemask_of_mempolicy
1852 *
1853 * If the current task's mempolicy is "default" [NULL], return 'false'
1854 * to indicate default policy. Otherwise, extract the policy nodemask
1855 * for 'bind' or 'interleave' policy into the argument nodemask, or
1856 * initialize the argument nodemask to contain the single node for
1857 * 'preferred' or 'local' policy and return 'true' to indicate presence
1858 * of non-default mempolicy.
1859 *
1860 * We don't bother with reference counting the mempolicy [mpol_get/put]
1861 * because the current task is examining its own mempolicy and a task's
1862 * mempolicy is only ever changed by the task itself.
1863 *
1864 * N.B., it is the caller's responsibility to free a returned nodemask.
1865 */
1866bool init_nodemask_of_mempolicy(nodemask_t *mask)
1867{
1868    struct mempolicy *mempolicy;
1869    int nid;
1870
1871    if (!(mask && current->mempolicy))
1872        return false;
1873
1874    task_lock(current);
1875    mempolicy = current->mempolicy;
1876    switch (mempolicy->mode) {
1877    case MPOL_PREFERRED:
1878        if (mempolicy->flags & MPOL_F_LOCAL)
1879            nid = numa_node_id();
1880        else
1881            nid = mempolicy->v.preferred_node;
1882        init_nodemask_of_node(mask, nid);
1883        break;
1884
1885    case MPOL_BIND:
1886        /* Fall through */
1887    case MPOL_INTERLEAVE:
1888        *mask = mempolicy->v.nodes;
1889        break;
1890
1891    default:
1892        BUG();
1893    }
1894    task_unlock(current);
1895
1896    return true;
1897}
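
/*
 * Usage sketch (hypothetical, modeled on the nr_hugepages sysctl handling):
 * derive the set of nodes an allocation loop should visit, falling back to
 * all nodes with memory when the task has only the default policy.
 */
static inline void example_nodes_to_visit(nodemask_t *nodes_allowed)
{
	if (!init_nodemask_of_mempolicy(nodes_allowed))
		*nodes_allowed = node_states[N_MEMORY];
	/* ... then iterate with for_each_node_mask(nid, *nodes_allowed) ... */
}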
1898#endif
1899
1900/*
1901 * mempolicy_nodemask_intersects
1902 *
1903 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1904 * policy. Otherwise, check for intersection between mask and the policy
1905 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1906 * policy, always return true since it may allocate elsewhere on fallback.
1907 *
1908 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1909 */
1910bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1911                    const nodemask_t *mask)
1912{
1913    struct mempolicy *mempolicy;
1914    bool ret = true;
1915
1916    if (!mask)
1917        return ret;
1918    task_lock(tsk);
1919    mempolicy = tsk->mempolicy;
1920    if (!mempolicy)
1921        goto out;
1922
1923    switch (mempolicy->mode) {
1924    case MPOL_PREFERRED:
1925        /*
1926         * MPOL_PREFERRED and MPOL_F_LOCAL only express a preference for
1927         * nodes to allocate from; they may fall back to other nodes under OOM.
1928         * Thus, it's possible for tsk to have allocated memory from
1929         * nodes in mask.
1930         */
1931        break;
1932    case MPOL_BIND:
1933    case MPOL_INTERLEAVE:
1934        ret = nodes_intersects(mempolicy->v.nodes, *mask);
1935        break;
1936    default:
1937        BUG();
1938    }
1939out:
1940    task_unlock(tsk);
1941    return ret;
1942}
1943
1944/* Allocate a page in interleaved policy.
1945   Own path because it needs to do special accounting. */
1946static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1947                    unsigned nid)
1948{
1949    struct zonelist *zl;
1950    struct page *page;
1951
1952    zl = node_zonelist(nid, gfp);
1953    page = __alloc_pages(gfp, order, zl);
1954    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1955        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1956    return page;
1957}
1958
1959/**
1960 * alloc_pages_vma - Allocate a page for a VMA.
1961 *
1962 * @gfp:
1963 * %GFP_USER user allocation.
1964 * %GFP_KERNEL kernel allocations,
1965 * %GFP_HIGHMEM highmem/user allocations,
1966 * %GFP_FS allocation should not call back into a file system.
1967 * %GFP_ATOMIC don't sleep.
1968 *
1969 * @order: Order of the GFP allocation.
1970 * @vma: Pointer to VMA or NULL if not available.
1971 * @addr: Virtual address of the allocation. Must be inside the VMA.
 * @node: Which node to prefer for allocation (modulo policy).
1972 *
1973 * This function allocates a page from the kernel page pool and applies
1974 * a NUMA policy associated with the VMA or the current process.
1975 * When @vma is not NULL, the caller must hold a read lock on the mmap_sem
1976 * of the VMA's mm_struct to prevent the VMA from going away. Should be
1977 * used for all allocations of pages that will be mapped into
1978 * user space.
1979 *
1980 * Returns NULL when no page can be allocated.
1981 */
1982struct page *
1983alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1984        unsigned long addr, int node)
1985{
1986    struct mempolicy *pol;
1987    struct page *page;
1988    unsigned int cpuset_mems_cookie;
1989
1990retry_cpuset:
1991    pol = get_vma_policy(current, vma, addr);
1992    cpuset_mems_cookie = get_mems_allowed();
1993
1994    if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1995        unsigned nid;
1996
1997        nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1998        mpol_cond_put(pol);
1999        page = alloc_page_interleave(gfp, order, nid);
2000        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2001            goto retry_cpuset;
2002
2003        return page;
2004    }
2005    page = __alloc_pages_nodemask(gfp, order,
2006                      policy_zonelist(gfp, pol, node),
2007                      policy_nodemask(gfp, pol));
2008    if (unlikely(mpol_needs_cond_ref(pol)))
2009        __mpol_put(pol);
2010    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2011        goto retry_cpuset;
2012    return page;
2013}
2014
2015/**
2016 * alloc_pages_current - Allocate pages.
2017 *
2018 * @gfp:
2019 * %GFP_USER user allocation,
2020 * %GFP_KERNEL kernel allocation,
2021 * %GFP_HIGHMEM highmem allocation,
2022 * %GFP_FS don't call back into a file system.
2023 * %GFP_ATOMIC don't sleep.
2024 * @order: Power of two of allocation size in pages. 0 is a single page.
2025 *
2026 * Allocate a page from the kernel page pool. When not in
2027 * interrupt context, apply the current process' NUMA policy.
2028 * Returns NULL when no page can be allocated.
2029 *
2030 * Don't call cpuset_update_task_memory_state() unless
2031 * 1) it's ok to take cpuset_sem (can WAIT), and
2032 * 2) allocating for current task (not interrupt).
2033 */
2034struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2035{
2036    struct mempolicy *pol = get_task_policy(current);
2037    struct page *page;
2038    unsigned int cpuset_mems_cookie;
2039
2040    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2041        pol = &default_policy;
2042
2043retry_cpuset:
2044    cpuset_mems_cookie = get_mems_allowed();
2045
2046    /*
2047     * No reference counting needed for current->mempolicy
2048     * nor system default_policy
2049     */
2050    if (pol->mode == MPOL_INTERLEAVE)
2051        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2052    else
2053        page = __alloc_pages_nodemask(gfp, order,
2054                policy_zonelist(gfp, pol, numa_node_id()),
2055                policy_nodemask(gfp, pol));
2056
2057    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2058        goto retry_cpuset;
2059
2060    return page;
2061}
2062EXPORT_SYMBOL(alloc_pages_current);
2063
2064/*
2065 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2066 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2067 * with the mems_allowed returned by cpuset_mems_allowed(). This
2068 * keeps mempolicies cpuset relative after its cpuset moves. See
2069 * further kernel/cpuset.c update_nodemask().
2070 *
2071 * current's mempolicy may be rebound by another task (the task that changes
2072 * the cpuset's mems), so we needn't do rebind work for the current task.
2073 */
2074
2075/* Slow path of a mempolicy duplicate */
2076struct mempolicy *__mpol_dup(struct mempolicy *old)
2077{
2078    struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2079
2080    if (!new)
2081        return ERR_PTR(-ENOMEM);
2082
2083    /* task's mempolicy is protected by alloc_lock */
2084    if (old == current->mempolicy) {
2085        task_lock(current);
2086        *new = *old;
2087        task_unlock(current);
2088    } else
2089        *new = *old;
2090
2091    rcu_read_lock();
2092    if (current_cpuset_is_being_rebound()) {
2093        nodemask_t mems = cpuset_mems_allowed(current);
2094        if (new->flags & MPOL_F_REBINDING)
2095            mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2096        else
2097            mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2098    }
2099    rcu_read_unlock();
2100    atomic_set(&new->refcnt, 1);
2101    return new;
2102}
2103
2104/* Slow path of a mempolicy comparison */
2105bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2106{
2107    if (!a || !b)
2108        return false;
2109    if (a->mode != b->mode)
2110        return false;
2111    if (a->flags != b->flags)
2112        return false;
2113    if (mpol_store_user_nodemask(a))
2114        if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2115            return false;
2116
2117    switch (a->mode) {
2118    case MPOL_BIND:
2119        /* Fall through */
2120    case MPOL_INTERLEAVE:
2121        return !!nodes_equal(a->v.nodes, b->v.nodes);
2122    case MPOL_PREFERRED:
2123        return a->v.preferred_node == b->v.preferred_node;
2124    default:
2125        BUG();
2126        return false;
2127    }
2128}
2129
2130/*
2131 * Shared memory backing store policy support.
2132 *
2133 * Remember policies even when nobody has shared memory mapped.
2134 * The policies are kept in Red-Black tree linked from the inode.
2135 * They are protected by the sp->lock spinlock, which should be held
2136 * for any accesses to the tree.
2137 */
2138
2139/* lookup first element intersecting start-end */
2140/* Caller holds sp->lock */
2141static struct sp_node *
2142sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2143{
2144    struct rb_node *n = sp->root.rb_node;
2145
2146    while (n) {
2147        struct sp_node *p = rb_entry(n, struct sp_node, nd);
2148
2149        if (start >= p->end)
2150            n = n->rb_right;
2151        else if (end <= p->start)
2152            n = n->rb_left;
2153        else
2154            break;
2155    }
2156    if (!n)
2157        return NULL;
2158    for (;;) {
2159        struct sp_node *w = NULL;
2160        struct rb_node *prev = rb_prev(n);
2161        if (!prev)
2162            break;
2163        w = rb_entry(prev, struct sp_node, nd);
2164        if (w->end <= start)
2165            break;
2166        n = prev;
2167    }
2168    return rb_entry(n, struct sp_node, nd);
2169}
2170
2171/* Insert a new shared policy into the list. */
2172/* Caller holds sp->lock */
2173static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2174{
2175    struct rb_node **p = &sp->root.rb_node;
2176    struct rb_node *parent = NULL;
2177    struct sp_node *nd;
2178
2179    while (*p) {
2180        parent = *p;
2181        nd = rb_entry(parent, struct sp_node, nd);
2182        if (new->start < nd->start)
2183            p = &(*p)->rb_left;
2184        else if (new->end > nd->end)
2185            p = &(*p)->rb_right;
2186        else
2187            BUG();
2188    }
2189    rb_link_node(&new->nd, parent, p);
2190    rb_insert_color(&new->nd, &sp->root);
2191    pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2192         new->policy ? new->policy->mode : 0);
2193}
2194
2195/* Find shared policy intersecting idx */
2196struct mempolicy *
2197mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2198{
2199    struct mempolicy *pol = NULL;
2200    struct sp_node *sn;
2201
2202    if (!sp->root.rb_node)
2203        return NULL;
2204    spin_lock(&sp->lock);
2205    sn = sp_lookup(sp, idx, idx+1);
2206    if (sn) {
2207        mpol_get(sn->policy);
2208        pol = sn->policy;
2209    }
2210    spin_unlock(&sp->lock);
2211    return pol;
2212}
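
/*
 * Usage sketch (hypothetical, modeled on a shmem ->get_policy callback):
 * translate a faulting address into a file index and look up the shared
 * policy stored on the inode. A NULL result means "fall back to the
 * process or default policy".
 */
static inline struct mempolicy *
example_shared_lookup(struct shared_policy *sp, struct vm_area_struct *vma,
			unsigned long addr)
{
	unsigned long idx;

	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	return mpol_shared_policy_lookup(sp, idx);	/* ref held if non-NULL */
}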
2213
2214static void sp_free(struct sp_node *n)
2215{
2216    mpol_put(n->policy);
2217    kmem_cache_free(sn_cache, n);
2218}
2219
2220/**
2221 * mpol_misplaced - check whether current page node is valid in policy
2222 *
2223 * @page - page to be checked
2224 * @vma - vm area where page mapped
2225 * @addr - virtual address where page mapped
2226 *
2227 * Lookup current policy node id for vma,addr and "compare to" page's
2228 * node id.
2229 *
2230 * Returns:
2231 * -1 - not misplaced, page is in the right node
2232 * node - node id where the page should be
2233 *
2234 * Policy determination "mimics" alloc_page_vma().
2235 * Called from fault path where we know the vma and faulting address.
2236 */
2237int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2238{
2239    struct mempolicy *pol;
2240    struct zone *zone;
2241    int curnid = page_to_nid(page);
2242    unsigned long pgoff;
2243    int polnid = -1;
2244    int ret = -1;
2245
2246    BUG_ON(!vma);
2247
2248    pol = get_vma_policy(current, vma, addr);
2249    if (!(pol->flags & MPOL_F_MOF))
2250        goto out;
2251
2252    switch (pol->mode) {
2253    case MPOL_INTERLEAVE:
2254        BUG_ON(addr >= vma->vm_end);
2255        BUG_ON(addr < vma->vm_start);
2256
2257        pgoff = vma->vm_pgoff;
2258        pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2259        polnid = offset_il_node(pol, vma, pgoff);
2260        break;
2261
2262    case MPOL_PREFERRED:
2263        if (pol->flags & MPOL_F_LOCAL)
2264            polnid = numa_node_id();
2265        else
2266            polnid = pol->v.preferred_node;
2267        break;
2268
2269    case MPOL_BIND:
2270        /*
2271         * allows binding to multiple nodes.
2272         * use current page if in policy nodemask,
2273         * else select nearest allowed node, if any.
2274         * If no allowed nodes, use current [!misplaced].
2275         */
2276        if (node_isset(curnid, pol->v.nodes))
2277            goto out;
2278        (void)first_zones_zonelist(
2279                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2280                gfp_zone(GFP_HIGHUSER),
2281                &pol->v.nodes, &zone);
2282        polnid = zone->node;
2283        break;
2284
2285    default:
2286        BUG();
2287    }
2288
2289    /* Migrate the page towards the node whose CPU is referencing it */
2290    if (pol->flags & MPOL_F_MORON) {
2291        int last_nid;
2292
2293        polnid = numa_node_id();
2294
2295        /*
2296         * Multi-stage node selection is used in conjunction
2297         * with a periodic migration fault to build a temporal
2298         * task<->page relation. By using a two-stage filter we
2299         * remove short/unlikely relations.
2300         *
2301         * Using P(p) ~ n_p / n_t as per frequentist
2302         * probability, we can equate a task's usage of a
2303         * particular page (n_p) per total usage of this
2304         * page (n_t) (in a given time-span) to a probability.
2305         *
2306         * Our periodic faults will sample this probability and
2307         * getting the same result twice in a row, given these
2308         * samples are fully independent, is then given by
2309         * P(n)^2, provided our sample period is sufficiently
2310         * short compared to the usage pattern.
2311         *
2312         * This quadratic squishes small probabilities, making
2313         * it less likely we act on an unlikely task<->page
2314         * relation.
2315         */
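
        /*
         * Numeric illustration (not from the source): a page this task
         * touches 1 time in 10 (P = 0.1) survives the two-sample filter
         * with probability P^2 = 0.01, while a page touched 9 times in
         * 10 survives with probability 0.81.
         */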
2316        last_nid = page_nid_xchg_last(page, polnid);
2317        if (last_nid != polnid)
2318            goto out;
2319    }
2320
2321    if (curnid != polnid)
2322        ret = polnid;
2323out:
2324    mpol_cond_put(pol);
2325
2326    return ret;
2327}
2328
2329static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2330{
2331    pr_debug("deleting %lx-%lx\n", n->start, n->end);
2332    rb_erase(&n->nd, &sp->root);
2333    sp_free(n);
2334}
2335
2336static void sp_node_init(struct sp_node *node, unsigned long start,
2337            unsigned long end, struct mempolicy *pol)
2338{
2339    node->start = start;
2340    node->end = end;
2341    node->policy = pol;
2342}
2343
2344static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2345                struct mempolicy *pol)
2346{
2347    struct sp_node *n;
2348    struct mempolicy *newpol;
2349
2350    n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2351    if (!n)
2352        return NULL;
2353
2354    newpol = mpol_dup(pol);
2355    if (IS_ERR(newpol)) {
2356        kmem_cache_free(sn_cache, n);
2357        return NULL;
2358    }
2359    newpol->flags |= MPOL_F_SHARED;
2360    sp_node_init(n, start, end, newpol);
2361
2362    return n;
2363}
2364
2365/* Replace a policy range. */
2366static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2367                 unsigned long end, struct sp_node *new)
2368{
2369    struct sp_node *n;
2370    struct sp_node *n_new = NULL;
2371    struct mempolicy *mpol_new = NULL;
2372    int ret = 0;
2373
2374restart:
2375    spin_lock(&sp->lock);
2376    n = sp_lookup(sp, start, end);
2377    /* Take care of old policies in the same range. */
2378    while (n && n->start < end) {
2379        struct rb_node *next = rb_next(&n->nd);
2380        if (n->start >= start) {
2381            if (n->end <= end)
2382                sp_delete(sp, n);
2383            else
2384                n->start = end;
2385        } else {
2386            /* Old policy spanning whole new range. */
2387            if (n->end > end) {
2388                if (!n_new)
2389                    goto alloc_new;
2390
2391                *mpol_new = *n->policy;
2392                atomic_set(&mpol_new->refcnt, 1);
2393                sp_node_init(n_new, end, n->end, mpol_new);
2394                n->end = start;
2395                sp_insert(sp, n_new);
2396                n_new = NULL;
2397                mpol_new = NULL;
2398                break;
2399            } else
2400                n->end = start;
2401        }
2402        if (!next)
2403            break;
2404        n = rb_entry(next, struct sp_node, nd);
2405    }
2406    if (new)
2407        sp_insert(sp, new);
2408    spin_unlock(&sp->lock);
2409    ret = 0;
2410
2411err_out:
2412    if (mpol_new)
2413        mpol_put(mpol_new);
2414    if (n_new)
2415        kmem_cache_free(sn_cache, n_new);
2416
2417    return ret;
2418
2419alloc_new:
2420    spin_unlock(&sp->lock);
2421    ret = -ENOMEM;
2422    n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2423    if (!n_new)
2424        goto err_out;
2425    mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2426    if (!mpol_new)
2427        goto err_out;
2428    goto restart;
2429}
2430
2431/**
2432 * mpol_shared_policy_init - initialize shared policy for inode
2433 * @sp: pointer to inode shared policy
2434 * @mpol: struct mempolicy to install
2435 *
2436 * Install non-NULL @mpol in inode's shared policy rb-tree.
2437 * On entry, the current task has a reference on a non-NULL @mpol.
2438 * This must be released on exit.
2439 * This is called from get_inode() and we can use GFP_KERNEL.
2440 */
2441void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2442{
2443    int ret;
2444
2445    sp->root = RB_ROOT; /* empty tree == default mempolicy */
2446    spin_lock_init(&sp->lock);
2447
2448    if (mpol) {
2449        struct vm_area_struct pvma;
2450        struct mempolicy *new;
2451        NODEMASK_SCRATCH(scratch);
2452
2453        if (!scratch)
2454            goto put_mpol;
2455        /* contextualize the tmpfs mount point mempolicy */
2456        new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2457        if (IS_ERR(new))
2458            goto free_scratch; /* no valid nodemask intersection */
2459
2460        task_lock(current);
2461        ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2462        task_unlock(current);
2463        if (ret)
2464            goto put_new;
2465
2466        /* Create pseudo-vma that contains just the policy */
2467        memset(&pvma, 0, sizeof(struct vm_area_struct));
2468        pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2469        mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2470
2471put_new:
2472        mpol_put(new); /* drop initial ref */
2473free_scratch:
2474        NODEMASK_SCRATCH_FREE(scratch);
2475put_mpol:
2476        mpol_put(mpol); /* drop our incoming ref on sb mpol */
2477    }
2478}
2479
2480int mpol_set_shared_policy(struct shared_policy *info,
2481            struct vm_area_struct *vma, struct mempolicy *npol)
2482{
2483    int err;
2484    struct sp_node *new = NULL;
2485    unsigned long sz = vma_pages(vma);
2486
2487    pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2488         vma->vm_pgoff,
2489         sz, npol ? npol->mode : -1,
2490         npol ? npol->flags : -1,
2491         npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2492
2493    if (npol) {
2494        new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2495        if (!new)
2496            return -ENOMEM;
2497    }
2498    err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2499    if (err && new)
2500        sp_free(new);
2501    return err;
2502}
2503
2504/* Free a backing policy store on inode delete. */
2505void mpol_free_shared_policy(struct shared_policy *p)
2506{
2507    struct sp_node *n;
2508    struct rb_node *next;
2509
2510    if (!p->root.rb_node)
2511        return;
2512    spin_lock(&p->lock);
2513    next = rb_first(&p->root);
2514    while (next) {
2515        n = rb_entry(next, struct sp_node, nd);
2516        next = rb_next(&n->nd);
2517        sp_delete(p, n);
2518    }
2519    spin_unlock(&p->lock);
2520}
2521
2522#ifdef CONFIG_NUMA_BALANCING
2523static bool __initdata numabalancing_override;
2524
2525static void __init check_numabalancing_enable(void)
2526{
2527    bool numabalancing_default = false;
2528
2529    if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2530        numabalancing_default = true;
2531
2532    if (nr_node_ids > 1 && !numabalancing_override) {
2533        printk(KERN_INFO "Enabling automatic NUMA balancing. "
2534            "Configure with numa_balancing= or the kernel.numa_balancing sysctl\n");
2535        set_numabalancing_state(numabalancing_default);
2536    }
2537}
2538
2539static int __init setup_numabalancing(char *str)
2540{
2541    int ret = 0;
2542    if (!str)
2543        goto out;
2544    numabalancing_override = true;
2545
2546    if (!strcmp(str, "enable")) {
2547        set_numabalancing_state(true);
2548        ret = 1;
2549    } else if (!strcmp(str, "disable")) {
2550        set_numabalancing_state(false);
2551        ret = 1;
2552    }
2553out:
2554    if (!ret)
2555        printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2556
2557    return ret;
2558}
2559__setup("numa_balancing=", setup_numabalancing);
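
/*
 * Example (boot time): passing "numa_balancing=disable" on the kernel
 * command line forces the feature off regardless of the Kconfig default;
 * the handler above records the override so check_numabalancing_enable()
 * will not flip it back.
 */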
2560#else
2561static inline void __init check_numabalancing_enable(void)
2562{
2563}
2564#endif /* CONFIG_NUMA_BALANCING */
2565
2566/* assumes fs == KERNEL_DS */
2567void __init numa_policy_init(void)
2568{
2569    nodemask_t interleave_nodes;
2570    unsigned long largest = 0;
2571    int nid, prefer = 0;
2572
2573    policy_cache = kmem_cache_create("numa_policy",
2574                     sizeof(struct mempolicy),
2575                     0, SLAB_PANIC, NULL);
2576
2577    sn_cache = kmem_cache_create("shared_policy_node",
2578                     sizeof(struct sp_node),
2579                     0, SLAB_PANIC, NULL);
2580
2581    for_each_node(nid) {
2582        preferred_node_policy[nid] = (struct mempolicy) {
2583            .refcnt = ATOMIC_INIT(1),
2584            .mode = MPOL_PREFERRED,
2585            .flags = MPOL_F_MOF | MPOL_F_MORON,
2586            .v = { .preferred_node = nid, },
2587        };
2588    }
2589
2590    /*
2591     * Set interleaving policy for system init. Interleaving is only
2592     * enabled across suitably sized nodes (default is >= 16MB), or
2593     * fall back to the largest node if they're all smaller.
2594     */
2595    nodes_clear(interleave_nodes);
2596    for_each_node_state(nid, N_MEMORY) {
2597        unsigned long total_pages = node_present_pages(nid);
2598
2599        /* Preserve the largest node */
2600        if (largest < total_pages) {
2601            largest = total_pages;
2602            prefer = nid;
2603        }
2604
2605        /* Interleave this node? */
2606        if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2607            node_set(nid, interleave_nodes);
2608    }
2609
2610    /* All too small, use the largest */
2611    if (unlikely(nodes_empty(interleave_nodes)))
2612        node_set(prefer, interleave_nodes);
2613
2614    if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2615        printk(KERN_WARNING "numa_policy_init: interleaving failed\n");
2616
2617    check_numabalancing_enable();
2618}
2619
2620/* Reset policy of current process to default */
2621void numa_default_policy(void)
2622{
2623    do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2624}
2625
2626/*
2627 * Parse and format mempolicy from/to strings
2628 */
2629
2630/*
2631 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2632 */
2633static const char * const policy_modes[] =
2634{
2635    [MPOL_DEFAULT] = "default",
2636    [MPOL_PREFERRED] = "prefer",
2637    [MPOL_BIND] = "bind",
2638    [MPOL_INTERLEAVE] = "interleave",
2639    [MPOL_LOCAL] = "local",
2640};
2641
2642
2643#ifdef CONFIG_TMPFS
2644/**
2645 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2646 * @str: string containing mempolicy to parse
2647 * @mpol: pointer to struct mempolicy pointer, returned on success.
2648 *
2649 * Format of input:
2650 * <mode>[=<flags>][:<nodelist>]
2651 *
2652 * On success, returns 0, else 1
2653 */
2654int mpol_parse_str(char *str, struct mempolicy **mpol)
2655{
2656    struct mempolicy *new = NULL;
2657    unsigned short mode;
2658    unsigned short mode_flags;
2659    nodemask_t nodes;
2660    char *nodelist = strchr(str, ':');
2661    char *flags = strchr(str, '=');
2662    int err = 1;
2663
2664    if (nodelist) {
2665        /* NUL-terminate mode or flags string */
2666        *nodelist++ = '\0';
2667        if (nodelist_parse(nodelist, nodes))
2668            goto out;
2669        if (!nodes_subset(nodes, node_states[N_MEMORY]))
2670            goto out;
2671    } else
2672        nodes_clear(nodes);
2673
2674    if (flags)
2675        *flags++ = '\0'; /* terminate mode string */
2676
2677    for (mode = 0; mode < MPOL_MAX; mode++) {
2678        if (!strcmp(str, policy_modes[mode])) {
2679            break;
2680        }
2681    }
2682    if (mode >= MPOL_MAX)
2683        goto out;
2684
2685    switch (mode) {
2686    case MPOL_PREFERRED:
2687        /*
2688         * Insist on a nodelist of one node only
2689         */
2690        if (nodelist) {
2691            char *rest = nodelist;
2692            while (isdigit(*rest))
2693                rest++;
2694            if (*rest)
2695                goto out;
2696        }
2697        break;
2698    case MPOL_INTERLEAVE:
2699        /*
2700         * Default to online nodes with memory if no nodelist
2701         */
2702        if (!nodelist)
2703            nodes = node_states[N_MEMORY];
2704        break;
2705    case MPOL_LOCAL:
2706        /*
2707         * Don't allow a nodelist; mpol_new() checks flags
2708         */
2709        if (nodelist)
2710            goto out;
2711        mode = MPOL_PREFERRED;
2712        break;
2713    case MPOL_DEFAULT:
2714        /*
2715         * Insist on an empty nodelist
2716         */
2717        if (!nodelist)
2718            err = 0;
2719        goto out;
2720    case MPOL_BIND:
2721        /*
2722         * Insist on a nodelist
2723         */
2724        if (!nodelist)
2725            goto out;
2726    }
2727
2728    mode_flags = 0;
2729    if (flags) {
2730        /*
2731         * Currently, we only support two mutually exclusive
2732         * mode flags.
2733         */
2734        if (!strcmp(flags, "static"))
2735            mode_flags |= MPOL_F_STATIC_NODES;
2736        else if (!strcmp(flags, "relative"))
2737            mode_flags |= MPOL_F_RELATIVE_NODES;
2738        else
2739            goto out;
2740    }
2741
2742    new = mpol_new(mode, mode_flags, &nodes);
2743    if (IS_ERR(new))
2744        goto out;
2745
2746    /*
2747     * Save nodes for mpol_to_str() to show the tmpfs mount options
2748     * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2749     */
2750    if (mode != MPOL_PREFERRED)
2751        new->v.nodes = nodes;
2752    else if (nodelist)
2753        new->v.preferred_node = first_node(nodes);
2754    else
2755        new->flags |= MPOL_F_LOCAL;
2756
2757    /*
2758     * Save nodes for contextualization: this will be used to "clone"
2759     * the mempolicy in a specific context [cpuset] at a later time.
2760     */
2761    new->w.user_nodemask = nodes;
2762
2763    err = 0;
2764
2765out:
2766    /* Restore string for error message */
2767    if (nodelist)
2768        *--nodelist = ':';
2769    if (flags)
2770        *--flags = '=';
2771    if (!err)
2772        *mpol = new;
2773    return err;
2774}
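
/*
 * Usage sketch (hypothetical): parsing a tmpfs "mpol=" mount option value
 * such as "interleave:0-3". The string is modified in place while parsing
 * and restored before return, so a writable copy must be passed in.
 */
static inline int example_parse_mount_mpol(struct mempolicy **mpol)
{
	char str[] = "interleave:0-3";		/* assumed option value */

	return mpol_parse_str(str, mpol);	/* 0 on success, 1 on error */
}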
2775#endif /* CONFIG_TMPFS */
2776
2777/**
2778 * mpol_to_str - format a mempolicy structure for printing
2779 * @buffer: to contain formatted mempolicy string
2780 * @maxlen: length of @buffer
2781 * @pol: pointer to mempolicy to be formatted
2782 *
2783 * Convert a mempolicy into a string.
2784 * Returns the number of characters in buffer (if positive)
2785 * or an error (negative)
2786 */
2787int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2788{
2789    char *p = buffer;
2790    int l;
2791    nodemask_t nodes;
2792    unsigned short mode;
2793    unsigned short flags = pol ? pol->flags : 0;
2794
2795    /*
2796     * Sanity check: room for longest mode, flag and some nodes
2797     */
2798    VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2799
2800    if (!pol || pol == &default_policy)
2801        mode = MPOL_DEFAULT;
2802    else
2803        mode = pol->mode;
2804
2805    switch (mode) {
2806    case MPOL_DEFAULT:
2807        nodes_clear(nodes);
2808        break;
2809
2810    case MPOL_PREFERRED:
2811        nodes_clear(nodes);
2812        if (flags & MPOL_F_LOCAL)
2813            mode = MPOL_LOCAL;
2814        else
2815            node_set(pol->v.preferred_node, nodes);
2816        break;
2817
2818    case MPOL_BIND:
2819        /* Fall through */
2820    case MPOL_INTERLEAVE:
2821        nodes = pol->v.nodes;
2822        break;
2823
2824    default:
2825        return -EINVAL;
2826    }
2827
2828    l = strlen(policy_modes[mode]);
2829    if (buffer + maxlen < p + l + 1)
2830        return -ENOSPC;
2831
2832    strcpy(p, policy_modes[mode]);
2833    p += l;
2834
2835    if (flags & MPOL_MODE_FLAGS) {
2836        if (buffer + maxlen < p + 2)
2837            return -ENOSPC;
2838        *p++ = '=';
2839
2840        /*
2841         * Currently, the only defined flags are mutually exclusive
2842         */
2843        if (flags & MPOL_F_STATIC_NODES)
2844            p += snprintf(p, buffer + maxlen - p, "static");
2845        else if (flags & MPOL_F_RELATIVE_NODES)
2846            p += snprintf(p, buffer + maxlen - p, "relative");
2847    }
2848
2849    if (!nodes_empty(nodes)) {
2850        if (buffer + maxlen < p + 2)
2851            return -ENOSPC;
2852        *p++ = ':';
2853        p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2854    }
2855    return p - buffer;
2856}
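
/*
 * Usage sketch (hypothetical, modeled on procfs output such as
 * show_numa_map()): format the effective policy into a small on-stack
 * buffer; mpol_to_str() insists on room for the longest mode, a flag and
 * some nodes, so 64 bytes is comfortably above its sanity check.
 */
static inline void example_show_policy(struct seq_file *m,
				struct mempolicy *pol)
{
	char buffer[64];

	if (mpol_to_str(buffer, sizeof(buffer), pol) >= 0)
		seq_printf(m, "mpol=%s", buffer);
}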
2857
