mm/mempolicy.c

1/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2.
7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated.
10 *
11 * Support four policies per VMA and per process:
12 *
13 * The VMA policy has priority over the process policy for a page fault.
14 *
15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter
20 * is used.
21 *
22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback.
24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead
27 *
28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default
32 * process policy.
33 *
34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default.
37 *
38 * The process policy is applied for most non-interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM.
42 *
43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins.
46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations.
51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped.
54 */
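/*
 * Illustrative userspace sketch (not kernel code): the policies described
 * above are normally selected through the set_mempolicy() and mbind()
 * system calls, shown here via libnuma's <numaif.h> wrappers; "buf" and
 * "len" stand for an assumed existing mapping.
 *
 *	#include <numaif.h>
 *
 *	unsigned long mask = (1UL << 0) | (1UL << 1);	// nodes 0 and 1
 *
 *	// Interleave this task's future allocations across nodes 0 and 1.
 *	set_mempolicy(MPOL_INTERLEAVE, &mask, sizeof(mask) * 8);
 *
 *	// Bind one mapping to node 0 and move its existing pages there.
 *	unsigned long node0 = 1UL << 0;
 *	mbind(buf, len, MPOL_BIND, &node0, sizeof(node0) * 8, MPOL_MF_MOVE);
 */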
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always grateful with that.
66*/
67
68#include <linux/mempolicy.h>
69#include <linux/mm.h>
70#include <linux/highmem.h>
71#include <linux/hugetlb.h>
72#include <linux/kernel.h>
73#include <linux/sched.h>
74#include <linux/nodemask.h>
75#include <linux/cpuset.h>
76#include <linux/slab.h>
77#include <linux/string.h>
78#include <linux/export.h>
79#include <linux/nsproxy.h>
80#include <linux/interrupt.h>
81#include <linux/init.h>
82#include <linux/compat.h>
83#include <linux/swap.h>
84#include <linux/seq_file.h>
85#include <linux/proc_fs.h>
86#include <linux/migrate.h>
87#include <linux/ksm.h>
88#include <linux/rmap.h>
89#include <linux/security.h>
90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92#include <linux/mm_inline.h>
93#include <linux/mmu_notifier.h>
94
95#include <asm/tlbflush.h>
96#include <asm/uaccess.h>
97#include <linux/random.h>
98
99#include "internal.h"
100
101/* Internal flags */
102#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
103#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
104
105static struct kmem_cache *policy_cache;
106static struct kmem_cache *sn_cache;
107
108/* Highest zone. A specific allocation for a zone below that is not
109   policied. */
110enum zone_type policy_zone = 0;
111
112/*
113 * run-time system-wide default policy => local allocation
114 */
115static struct mempolicy default_policy = {
116    .refcnt = ATOMIC_INIT(1), /* never free it */
117    .mode = MPOL_PREFERRED,
118    .flags = MPOL_F_LOCAL,
119};
120
121static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122
123static struct mempolicy *get_task_policy(struct task_struct *p)
124{
125    struct mempolicy *pol = p->mempolicy;
126
127    if (!pol) {
128        int node = numa_node_id();
129
130        if (node != NUMA_NO_NODE) {
131            pol = &preferred_node_policy[node];
132            /*
133             * preferred_node_policy is not initialised early in
134             * boot
135             */
136            if (!pol->mode)
137                pol = NULL;
138        }
139    }
140
141    return pol;
142}
143
144static const struct mempolicy_operations {
145    int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146    /*
147     * If the read-side task has no lock to protect task->mempolicy, the
148     * write-side task will rebind task->mempolicy in two steps. The first
149     * step sets all the newly allowed nodes, and the second step clears
150     * all the disallowed nodes. This way we avoid a window in which no
151     * node is available to allocate a page from.
152     * If we have a lock to protect task->mempolicy on the read side, we
153     * rebind directly.
154     *
155     * step:
156     * MPOL_REBIND_ONCE - do the rebind work at once
157     * MPOL_REBIND_STEP1 - set all the newly allowed nodes
158     * MPOL_REBIND_STEP2 - clear all the disallowed nodes
159     */
160    void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161            enum mpol_rebind_step step);
162} mpol_ops[MPOL_MAX];
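/*
 * Worked example of the two-step rebind described above (a rough sketch):
 * suppose a policy currently uses nodes {0,1} and the cpuset is changed so
 * that only nodes {2,3} are allowed.
 *
 *	MPOL_REBIND_STEP1: pol->v.nodes = {0,1} | {2,3} = {0,1,2,3}
 *	MPOL_REBIND_STEP2: pol->v.nodes = {2,3}
 *
 * Between the two steps a lockless reader always sees at least one node it
 * is allowed to allocate from, which is the point of splitting the rebind.
 */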
163
164/* Check that the nodemask contains at least one populated zone */
165static int is_valid_nodemask(const nodemask_t *nodemask)
166{
167    return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168}
169
170static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171{
172    return pol->flags & MPOL_MODE_FLAGS;
173}
174
175static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176                   const nodemask_t *rel)
177{
178    nodemask_t tmp;
179    nodes_fold(tmp, *orig, nodes_weight(*rel));
180    nodes_onto(*ret, tmp, *rel);
181}
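/*
 * Worked example (sketch): with MPOL_F_RELATIVE_NODES the user's nodemask
 * is interpreted as positions within the currently allowed set. For
 * *orig = {0,2} and *rel = {4,5,6,7}, nodes_fold()/nodes_onto() map bit 0
 * to the 1st allowed node and bit 2 to the 3rd, so *ret = {4,6}.
 */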
182
183static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184{
185    if (nodes_empty(*nodes))
186        return -EINVAL;
187    pol->v.nodes = *nodes;
188    return 0;
189}
190
191static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192{
193    if (!nodes)
194        pol->flags |= MPOL_F_LOCAL; /* local allocation */
195    else if (nodes_empty(*nodes))
196        return -EINVAL; /* no allowed nodes */
197    else
198        pol->v.preferred_node = first_node(*nodes);
199    return 0;
200}
201
202static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203{
204    if (!is_valid_nodemask(nodes))
205        return -EINVAL;
206    pol->v.nodes = *nodes;
207    return 0;
208}
209
210/*
211 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212 * any, for the new policy. mpol_new() has already validated the nodes
213 * parameter with respect to the policy mode and flags. But, we need to
214 * handle an empty nodemask with MPOL_PREFERRED here.
215 *
216 * Must be called holding task's alloc_lock to protect task's mems_allowed
217 * and mempolicy. May also be called holding the mmap_semaphore for write.
218 */
219static int mpol_set_nodemask(struct mempolicy *pol,
220             const nodemask_t *nodes, struct nodemask_scratch *nsc)
221{
222    int ret;
223
224    /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225    if (pol == NULL)
226        return 0;
227    /* Check N_MEMORY */
228    nodes_and(nsc->mask1,
229          cpuset_current_mems_allowed, node_states[N_MEMORY]);
230
231    VM_BUG_ON(!nodes);
232    if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233        nodes = NULL; /* explicit local allocation */
234    else {
235        if (pol->flags & MPOL_F_RELATIVE_NODES)
236            mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237        else
238            nodes_and(nsc->mask2, *nodes, nsc->mask1);
239
240        if (mpol_store_user_nodemask(pol))
241            pol->w.user_nodemask = *nodes;
242        else
243            pol->w.cpuset_mems_allowed =
244                        cpuset_current_mems_allowed;
245    }
246
247    if (nodes)
248        ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249    else
250        ret = mpol_ops[pol->mode].create(pol, NULL);
251    return ret;
252}
253
254/*
255 * This function just creates a new policy, does some checks and simple
256 * initialization. You must invoke mpol_set_nodemask() to set nodes.
257 */
258static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259                  nodemask_t *nodes)
260{
261    struct mempolicy *policy;
262
263    pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264         mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265
266    if (mode == MPOL_DEFAULT) {
267        if (nodes && !nodes_empty(*nodes))
268            return ERR_PTR(-EINVAL);
269        return NULL;
270    }
271    VM_BUG_ON(!nodes);
272
273    /*
274     * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275     * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276     * All other modes require a valid pointer to a non-empty nodemask.
277     */
278    if (mode == MPOL_PREFERRED) {
279        if (nodes_empty(*nodes)) {
280            if (((flags & MPOL_F_STATIC_NODES) ||
281                 (flags & MPOL_F_RELATIVE_NODES)))
282                return ERR_PTR(-EINVAL);
283        }
284    } else if (mode == MPOL_LOCAL) {
285        if (!nodes_empty(*nodes))
286            return ERR_PTR(-EINVAL);
287        mode = MPOL_PREFERRED;
288    } else if (nodes_empty(*nodes))
289        return ERR_PTR(-EINVAL);
290    policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291    if (!policy)
292        return ERR_PTR(-ENOMEM);
293    atomic_set(&policy->refcnt, 1);
294    policy->mode = mode;
295    policy->flags = flags;
296
297    return policy;
298}
299
300/* Slow path of a mpol destructor. */
301void __mpol_put(struct mempolicy *p)
302{
303    if (!atomic_dec_and_test(&p->refcnt))
304        return;
305    kmem_cache_free(policy_cache, p);
306}
307
308static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309                enum mpol_rebind_step step)
310{
311}
312
313/*
314 * step:
315 * MPOL_REBIND_ONCE - do the rebind work at once
316 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
317 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
318 */
319static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320                 enum mpol_rebind_step step)
321{
322    nodemask_t tmp;
323
324    if (pol->flags & MPOL_F_STATIC_NODES)
325        nodes_and(tmp, pol->w.user_nodemask, *nodes);
326    else if (pol->flags & MPOL_F_RELATIVE_NODES)
327        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328    else {
329        /*
330         * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331         * result
332         */
333        if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334            nodes_remap(tmp, pol->v.nodes,
335                    pol->w.cpuset_mems_allowed, *nodes);
336            pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337        } else if (step == MPOL_REBIND_STEP2) {
338            tmp = pol->w.cpuset_mems_allowed;
339            pol->w.cpuset_mems_allowed = *nodes;
340        } else
341            BUG();
342    }
343
344    if (nodes_empty(tmp))
345        tmp = *nodes;
346
347    if (step == MPOL_REBIND_STEP1)
348        nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349    else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350        pol->v.nodes = tmp;
351    else
352        BUG();
353
354    if (!node_isset(current->il_next, tmp)) {
355        current->il_next = next_node(current->il_next, tmp);
356        if (current->il_next >= MAX_NUMNODES)
357            current->il_next = first_node(tmp);
358        if (current->il_next >= MAX_NUMNODES)
359            current->il_next = numa_node_id();
360    }
361}
362
363static void mpol_rebind_preferred(struct mempolicy *pol,
364                  const nodemask_t *nodes,
365                  enum mpol_rebind_step step)
366{
367    nodemask_t tmp;
368
369    if (pol->flags & MPOL_F_STATIC_NODES) {
370        int node = first_node(pol->w.user_nodemask);
371
372        if (node_isset(node, *nodes)) {
373            pol->v.preferred_node = node;
374            pol->flags &= ~MPOL_F_LOCAL;
375        } else
376            pol->flags |= MPOL_F_LOCAL;
377    } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378        mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379        pol->v.preferred_node = first_node(tmp);
380    } else if (!(pol->flags & MPOL_F_LOCAL)) {
381        pol->v.preferred_node = node_remap(pol->v.preferred_node,
382                           pol->w.cpuset_mems_allowed,
383                           *nodes);
384        pol->w.cpuset_mems_allowed = *nodes;
385    }
386}
387
388/*
389 * mpol_rebind_policy - Migrate a policy to a different set of nodes
390 *
391 * If the read-side task has no lock to protect task->mempolicy, the
392 * write-side task will rebind task->mempolicy in two steps. The first
393 * step sets all the newly allowed nodes, and the second step clears all
394 * the disallowed nodes. This way we avoid a window in which no node is
395 * available to allocate a page from.
396 * If we have a lock to protect task->mempolicy on the read side, we
397 * rebind directly.
398 *
399 * step:
400 * MPOL_REBIND_ONCE - do the rebind work at once
401 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
402 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
403 */
404static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405                enum mpol_rebind_step step)
406{
407    if (!pol)
408        return;
409    if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410        nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411        return;
412
413    if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414        return;
415
416    if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417        BUG();
418
419    if (step == MPOL_REBIND_STEP1)
420        pol->flags |= MPOL_F_REBINDING;
421    else if (step == MPOL_REBIND_STEP2)
422        pol->flags &= ~MPOL_F_REBINDING;
423    else if (step >= MPOL_REBIND_NSTEP)
424        BUG();
425
426    mpol_ops[pol->mode].rebind(pol, newmask, step);
427}
428
429/*
430 * Wrapper for mpol_rebind_policy() that just requires task
431 * pointer, and updates task mempolicy.
432 *
433 * Called with task's alloc_lock held.
434 */
435
436void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437            enum mpol_rebind_step step)
438{
439    mpol_rebind_policy(tsk->mempolicy, new, step);
440}
441
442/*
443 * Rebind each vma in mm to new nodemask.
444 *
445 * Call holding a reference to mm. Takes mm->mmap_sem during call.
446 */
447
448void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449{
450    struct vm_area_struct *vma;
451
452    down_write(&mm->mmap_sem);
453    for (vma = mm->mmap; vma; vma = vma->vm_next)
454        mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455    up_write(&mm->mmap_sem);
456}
457
458static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459    [MPOL_DEFAULT] = {
460        .rebind = mpol_rebind_default,
461    },
462    [MPOL_INTERLEAVE] = {
463        .create = mpol_new_interleave,
464        .rebind = mpol_rebind_nodemask,
465    },
466    [MPOL_PREFERRED] = {
467        .create = mpol_new_preferred,
468        .rebind = mpol_rebind_preferred,
469    },
470    [MPOL_BIND] = {
471        .create = mpol_new_bind,
472        .rebind = mpol_rebind_nodemask,
473    },
474};
475
476static void migrate_page_add(struct page *page, struct list_head *pagelist,
477                unsigned long flags);
478
479/*
480 * Scan through the pages, checking whether each one satisfies the given
481 * conditions, and move those that do to the pagelist.
482 */
483static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484        unsigned long addr, unsigned long end,
485        const nodemask_t *nodes, unsigned long flags,
486        void *private)
487{
488    pte_t *orig_pte;
489    pte_t *pte;
490    spinlock_t *ptl;
491
492    orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493    do {
494        struct page *page;
495        int nid;
496
497        if (!pte_present(*pte))
498            continue;
499        page = vm_normal_page(vma, addr, *pte);
500        if (!page)
501            continue;
502        /*
503         * vm_normal_page() filters out zero pages, but there might
504         * still be PageReserved pages to skip, perhaps in a VDSO.
505         */
506        if (PageReserved(page))
507            continue;
508        nid = page_to_nid(page);
509        if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510            continue;
511
512        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513            migrate_page_add(page, private, flags);
514        else
515            break;
516    } while (pte++, addr += PAGE_SIZE, addr != end);
517    pte_unmap_unlock(orig_pte, ptl);
518    return addr != end;
519}
520
521static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522        pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523                    void *private)
524{
525#ifdef CONFIG_HUGETLB_PAGE
526    int nid;
527    struct page *page;
528
529    spin_lock(&vma->vm_mm->page_table_lock);
530    page = pte_page(huge_ptep_get((pte_t *)pmd));
531    nid = page_to_nid(page);
532    if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533        goto unlock;
534    /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535    if (flags & (MPOL_MF_MOVE_ALL) ||
536        (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537        isolate_huge_page(page, private);
538unlock:
539    spin_unlock(&vma->vm_mm->page_table_lock);
540#else
541    BUG();
542#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
546        unsigned long addr, unsigned long end,
547        const nodemask_t *nodes, unsigned long flags,
548        void *private)
549{
550    pmd_t *pmd;
551    unsigned long next;
552
553    pmd = pmd_offset(pud, addr);
554    do {
555        next = pmd_addr_end(addr, end);
556        if (!pmd_present(*pmd))
557            continue;
558        if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559            queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560                        flags, private);
561            continue;
562        }
563        split_huge_page_pmd(vma, addr, pmd);
564        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
565            continue;
566        if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
567                    flags, private))
568            return -EIO;
569    } while (pmd++, addr = next, addr != end);
570    return 0;
571}
572
573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
574        unsigned long addr, unsigned long end,
575        const nodemask_t *nodes, unsigned long flags,
576        void *private)
577{
578    pud_t *pud;
579    unsigned long next;
580
581    pud = pud_offset(pgd, addr);
582    do {
583        next = pud_addr_end(addr, end);
584        if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585            continue;
586        if (pud_none_or_clear_bad(pud))
587            continue;
588        if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
589                    flags, private))
590            return -EIO;
591    } while (pud++, addr = next, addr != end);
592    return 0;
593}
594
595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
596        unsigned long addr, unsigned long end,
597        const nodemask_t *nodes, unsigned long flags,
598        void *private)
599{
600    pgd_t *pgd;
601    unsigned long next;
602
603    pgd = pgd_offset(vma->vm_mm, addr);
604    do {
605        next = pgd_addr_end(addr, end);
606        if (pgd_none_or_clear_bad(pgd))
607            continue;
608        if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
609                    flags, private))
610            return -EIO;
611    } while (pgd++, addr = next, addr != end);
612    return 0;
613}
614
615#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
616/*
617 * This is used to mark a range of virtual addresses to be inaccessible.
618 * These are later cleared by a NUMA hinting fault. Depending on these
619 * faults, pages may be migrated for better NUMA placement.
620 *
621 * This is assuming that NUMA faults are handled using PROT_NONE. If
622 * an architecture makes a different choice, it will need further
623 * changes to the core.
624 */
625unsigned long change_prot_numa(struct vm_area_struct *vma,
626            unsigned long addr, unsigned long end)
627{
628    int nr_updated;
629    BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
630
631    nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632    if (nr_updated)
633        count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634
635    return nr_updated;
636}
637#else
638static unsigned long change_prot_numa(struct vm_area_struct *vma,
639            unsigned long addr, unsigned long end)
640{
641    return 0;
642}
643#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
644
645/*
646 * Walk through page tables and collect pages to be migrated.
647 *
648 * If pages found in a given range are on the set of nodes (determined by
649 * @nodes and @flags), they are isolated and queued to the pagelist, which
650 * is passed via @private.
651 */
652static struct vm_area_struct *
653queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654        const nodemask_t *nodes, unsigned long flags, void *private)
655{
656    int err;
657    struct vm_area_struct *first, *vma, *prev;
658
659
660    first = find_vma(mm, start);
661    if (!first)
662        return ERR_PTR(-EFAULT);
663    prev = NULL;
664    for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665        unsigned long endvma = vma->vm_end;
666
667        if (endvma > end)
668            endvma = end;
669        if (vma->vm_start > start)
670            start = vma->vm_start;
671
672        if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673            if (!vma->vm_next && vma->vm_end < end)
674                return ERR_PTR(-EFAULT);
675            if (prev && prev->vm_end < vma->vm_start)
676                return ERR_PTR(-EFAULT);
677        }
678
679        if (flags & MPOL_MF_LAZY) {
680            change_prot_numa(vma, start, endvma);
681            goto next;
682        }
683
684        if ((flags & MPOL_MF_STRICT) ||
685             ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686              vma_migratable(vma))) {
687
688            err = queue_pages_pgd_range(vma, start, endvma, nodes,
689                        flags, private);
690            if (err) {
691                first = ERR_PTR(err);
692                break;
693            }
694        }
695next:
696        prev = vma;
697    }
698    return first;
699}
700
701/*
702 * Apply policy to a single VMA
703 * This must be called with the mmap_sem held for writing.
704 */
705static int vma_replace_policy(struct vm_area_struct *vma,
706                        struct mempolicy *pol)
707{
708    int err;
709    struct mempolicy *old;
710    struct mempolicy *new;
711
712    pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713         vma->vm_start, vma->vm_end, vma->vm_pgoff,
714         vma->vm_ops, vma->vm_file,
715         vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716
717    new = mpol_dup(pol);
718    if (IS_ERR(new))
719        return PTR_ERR(new);
720
721    if (vma->vm_ops && vma->vm_ops->set_policy) {
722        err = vma->vm_ops->set_policy(vma, new);
723        if (err)
724            goto err_out;
725    }
726
727    old = vma->vm_policy;
728    vma->vm_policy = new; /* protected by mmap_sem */
729    mpol_put(old);
730
731    return 0;
732 err_out:
733    mpol_put(new);
734    return err;
735}
736
737/* Step 2: apply policy to a range and do splits. */
738static int mbind_range(struct mm_struct *mm, unsigned long start,
739               unsigned long end, struct mempolicy *new_pol)
740{
741    struct vm_area_struct *next;
742    struct vm_area_struct *prev;
743    struct vm_area_struct *vma;
744    int err = 0;
745    pgoff_t pgoff;
746    unsigned long vmstart;
747    unsigned long vmend;
748
749    vma = find_vma(mm, start);
750    if (!vma || vma->vm_start > start)
751        return -EFAULT;
752
753    prev = vma->vm_prev;
754    if (start > vma->vm_start)
755        prev = vma;
756
757    for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758        next = vma->vm_next;
759        vmstart = max(start, vma->vm_start);
760        vmend = min(end, vma->vm_end);
761
762        if (mpol_equal(vma_policy(vma), new_pol))
763            continue;
764
765        pgoff = vma->vm_pgoff +
766            ((vmstart - vma->vm_start) >> PAGE_SHIFT);
767        prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768                  vma->anon_vma, vma->vm_file, pgoff,
769                  new_pol);
770        if (prev) {
771            vma = prev;
772            next = vma->vm_next;
773            if (mpol_equal(vma_policy(vma), new_pol))
774                continue;
775            /* vma_merge() joined vma && vma->next, case 8 */
776            goto replace;
777        }
778        if (vma->vm_start != vmstart) {
779            err = split_vma(vma->vm_mm, vma, vmstart, 1);
780            if (err)
781                goto out;
782        }
783        if (vma->vm_end != vmend) {
784            err = split_vma(vma->vm_mm, vma, vmend, 0);
785            if (err)
786                goto out;
787        }
788 replace:
789        err = vma_replace_policy(vma, new_pol);
790        if (err)
791            goto out;
792    }
793
794 out:
795    return err;
796}
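/*
 * Worked example (sketch, hypothetical addresses): applying a new policy to
 * [0x2000, 0x3000) in the middle of a VMA spanning [0x1000, 0x4000) first
 * tries vma_merge() with the neighbours; if that fails, split_vma() runs
 * twice, leaving [0x1000, 0x2000) and [0x3000, 0x4000) with the old policy
 * and [0x2000, 0x3000) getting new_pol via vma_replace_policy().
 */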
797
798/*
799 * Update task->flags PF_MEMPOLICY bit: set iff non-default
800 * mempolicy. Allows more rapid checking of this (combined perhaps
801 * with other PF_* flag bits) on memory allocation hot code paths.
802 *
803 * If called from outside this file, the task 'p' should -only- be
804 * a newly forked child not yet visible on the task list, because
805 * manipulating the task flags of a visible task is not safe.
806 *
807 * The above limitation is why this routine has the funny name
808 * mpol_fix_fork_child_flag().
809 *
810 * It is also safe to call this with a task pointer of current,
811 * which the static wrapper mpol_set_task_struct_flag() does,
812 * for use within this file.
813 */
814
815void mpol_fix_fork_child_flag(struct task_struct *p)
816{
817    if (p->mempolicy)
818        p->flags |= PF_MEMPOLICY;
819    else
820        p->flags &= ~PF_MEMPOLICY;
821}
822
823static void mpol_set_task_struct_flag(void)
824{
825    mpol_fix_fork_child_flag(current);
826}
827
828/* Set the process memory policy */
829static long do_set_mempolicy(unsigned short mode, unsigned short flags,
830                 nodemask_t *nodes)
831{
832    struct mempolicy *new, *old;
833    struct mm_struct *mm = current->mm;
834    NODEMASK_SCRATCH(scratch);
835    int ret;
836
837    if (!scratch)
838        return -ENOMEM;
839
840    new = mpol_new(mode, flags, nodes);
841    if (IS_ERR(new)) {
842        ret = PTR_ERR(new);
843        goto out;
844    }
845    /*
846     * prevent changing our mempolicy while show_numa_maps()
847     * is using it.
848     * Note: do_set_mempolicy() can be called at init time
849     * with no 'mm'.
850     */
851    if (mm)
852        down_write(&mm->mmap_sem);
853    task_lock(current);
854    ret = mpol_set_nodemask(new, nodes, scratch);
855    if (ret) {
856        task_unlock(current);
857        if (mm)
858            up_write(&mm->mmap_sem);
859        mpol_put(new);
860        goto out;
861    }
862    old = current->mempolicy;
863    current->mempolicy = new;
864    mpol_set_task_struct_flag();
865    if (new && new->mode == MPOL_INTERLEAVE &&
866        nodes_weight(new->v.nodes))
867        current->il_next = first_node(new->v.nodes);
868    task_unlock(current);
869    if (mm)
870        up_write(&mm->mmap_sem);
871
872    mpol_put(old);
873    ret = 0;
874out:
875    NODEMASK_SCRATCH_FREE(scratch);
876    return ret;
877}
878
879/*
880 * Return nodemask for policy for get_mempolicy() query
881 *
882 * Called with task's alloc_lock held
883 */
884static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
885{
886    nodes_clear(*nodes);
887    if (p == &default_policy)
888        return;
889
890    switch (p->mode) {
891    case MPOL_BIND:
892        /* Fall through */
893    case MPOL_INTERLEAVE:
894        *nodes = p->v.nodes;
895        break;
896    case MPOL_PREFERRED:
897        if (!(p->flags & MPOL_F_LOCAL))
898            node_set(p->v.preferred_node, *nodes);
899        /* else return empty node mask for local allocation */
900        break;
901    default:
902        BUG();
903    }
904}
905
906static int lookup_node(struct mm_struct *mm, unsigned long addr)
907{
908    struct page *p;
909    int err;
910
911    err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
912    if (err >= 0) {
913        err = page_to_nid(p);
914        put_page(p);
915    }
916    return err;
917}
918
919/* Retrieve NUMA policy */
920static long do_get_mempolicy(int *policy, nodemask_t *nmask,
921                 unsigned long addr, unsigned long flags)
922{
923    int err;
924    struct mm_struct *mm = current->mm;
925    struct vm_area_struct *vma = NULL;
926    struct mempolicy *pol = current->mempolicy;
927
928    if (flags &
929        ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
930        return -EINVAL;
931
932    if (flags & MPOL_F_MEMS_ALLOWED) {
933        if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
934            return -EINVAL;
935        *policy = 0; /* just so it's initialized */
936        task_lock(current);
937        *nmask = cpuset_current_mems_allowed;
938        task_unlock(current);
939        return 0;
940    }
941
942    if (flags & MPOL_F_ADDR) {
943        /*
944         * Do NOT fall back to task policy if the
945         * vma/shared policy at addr is NULL. We
946         * want to return MPOL_DEFAULT in this case.
947         */
948        down_read(&mm->mmap_sem);
949        vma = find_vma_intersection(mm, addr, addr+1);
950        if (!vma) {
951            up_read(&mm->mmap_sem);
952            return -EFAULT;
953        }
954        if (vma->vm_ops && vma->vm_ops->get_policy)
955            pol = vma->vm_ops->get_policy(vma, addr);
956        else
957            pol = vma->vm_policy;
958    } else if (addr)
959        return -EINVAL;
960
961    if (!pol)
962        pol = &default_policy; /* indicates default behavior */
963
964    if (flags & MPOL_F_NODE) {
965        if (flags & MPOL_F_ADDR) {
966            err = lookup_node(mm, addr);
967            if (err < 0)
968                goto out;
969            *policy = err;
970        } else if (pol == current->mempolicy &&
971                pol->mode == MPOL_INTERLEAVE) {
972            *policy = current->il_next;
973        } else {
974            err = -EINVAL;
975            goto out;
976        }
977    } else {
978        *policy = pol == &default_policy ? MPOL_DEFAULT :
979                        pol->mode;
980        /*
981         * Internal mempolicy flags must be masked off before exposing
982         * the policy to userspace.
983         */
984        *policy |= (pol->flags & MPOL_MODE_FLAGS);
985    }
986
987    if (vma) {
988        up_read(&current->mm->mmap_sem);
989        vma = NULL;
990    }
991
992    err = 0;
993    if (nmask) {
994        if (mpol_store_user_nodemask(pol)) {
995            *nmask = pol->w.user_nodemask;
996        } else {
997            task_lock(current);
998            get_policy_nodemask(pol, nmask);
999            task_unlock(current);
1000        }
1001    }
1002
1003 out:
1004    mpol_cond_put(pol);
1005    if (vma)
1006        up_read(&current->mm->mmap_sem);
1007    return err;
1008}
1009
1010#ifdef CONFIG_MIGRATION
1011/*
1012 * page migration
1013 */
1014static void migrate_page_add(struct page *page, struct list_head *pagelist,
1015                unsigned long flags)
1016{
1017    /*
1018     * Avoid migrating a page that is shared with others.
1019     */
1020    if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
1021        if (!isolate_lru_page(page)) {
1022            list_add_tail(&page->lru, pagelist);
1023            inc_zone_page_state(page, NR_ISOLATED_ANON +
1024                        page_is_file_cache(page));
1025        }
1026    }
1027}
1028
1029static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1030{
1031    if (PageHuge(page))
1032        return alloc_huge_page_node(page_hstate(compound_head(page)),
1033                    node);
1034    else
1035        return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1036}
1037
1038/*
1039 * Migrate pages from one node to a target node.
1040 * Returns error or the number of pages not migrated.
1041 */
1042static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1043               int flags)
1044{
1045    nodemask_t nmask;
1046    LIST_HEAD(pagelist);
1047    int err = 0;
1048
1049    nodes_clear(nmask);
1050    node_set(source, nmask);
1051
1052    /*
1053     * This does not "check" the range but isolates all pages that
1054     * need migration. Between passing in the full user address
1055     * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1056     */
1057    VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1058    queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1059            flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1060
1061    if (!list_empty(&pagelist)) {
1062        err = migrate_pages(&pagelist, new_node_page, dest,
1063                    MIGRATE_SYNC, MR_SYSCALL);
1064        if (err)
1065            putback_movable_pages(&pagelist);
1066    }
1067
1068    return err;
1069}
1070
1071/*
1072 * Move pages between the two nodesets so as to preserve the physical
1073 * layout as much as possible.
1074 *
1075 * Returns the number of pages that could not be moved.
1076 */
1077int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1078             const nodemask_t *to, int flags)
1079{
1080    int busy = 0;
1081    int err;
1082    nodemask_t tmp;
1083
1084    err = migrate_prep();
1085    if (err)
1086        return err;
1087
1088    down_read(&mm->mmap_sem);
1089
1090    err = migrate_vmas(mm, from, to, flags);
1091    if (err)
1092        goto out;
1093
1094    /*
1095     * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1096     * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1097     * bit in 'tmp', and return that <source, dest> pair for migration.
1098     * The pair of nodemasks 'to' and 'from' define the map.
1099     *
1100     * If no pair of bits is found that way, fallback to picking some
1101     * pair of 'source' and 'dest' bits that are not the same. If the
1102     * 'source' and 'dest' bits are the same, this represents a node
1103     * that will be migrating to itself, so no pages need move.
1104     *
1105     * If no bits are left in 'tmp', or if all remaining bits left
1106     * in 'tmp' correspond to the same bit in 'to', return false
1107     * (nothing left to migrate).
1108     *
1109     * This lets us pick a pair of nodes to migrate between, such that
1110     * if possible the dest node is not already occupied by some other
1111     * source node, minimizing the risk of overloading the memory on a
1112     * node that would happen if we migrated incoming memory to a node
1113 * before migrating outgoing memory from that same node.
1114     *
1115     * A single scan of tmp is sufficient. As we go, we remember the
1116     * most recent <s, d> pair that moved (s != d). If we find a pair
1117     * that not only moved, but what's better, moved to an empty slot
1118 * (d is not set in tmp), then we break out with that pair.
1119     * Otherwise when we finish scanning from_tmp, we at least have the
1120     * most recent <s, d> pair that moved. If we get all the way through
1121     * the scan of tmp without finding any node that moved, much less
1122     * moved to an empty node, then there is nothing left worth migrating.
1123     */
1124
1125    tmp = *from;
1126    while (!nodes_empty(tmp)) {
1127        int s,d;
1128        int source = -1;
1129        int dest = 0;
1130
1131        for_each_node_mask(s, tmp) {
1132
1133            /*
1134             * do_migrate_pages() tries to maintain the relative
1135             * node relationship of the pages established between
1136             * threads and memory areas.
1137             *
1138             * However if the number of source nodes is not equal to
1139             * the number of destination nodes we can not preserve
1140             * this node relative relationship. In that case, skip
1141             * copying memory from a node that is in the destination
1142             * mask.
1143             *
1144             * Example: [2,3,4] -> [3,4,5] moves everything.
1145 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1146             */
1147
1148            if ((nodes_weight(*from) != nodes_weight(*to)) &&
1149                        (node_isset(s, *to)))
1150                continue;
1151
1152            d = node_remap(s, *from, *to);
1153            if (s == d)
1154                continue;
1155
1156            source = s; /* Node moved. Memorize */
1157            dest = d;
1158
1159            /* dest not in remaining from nodes? */
1160            if (!node_isset(dest, tmp))
1161                break;
1162        }
1163        if (source == -1)
1164            break;
1165
1166        node_clear(source, tmp);
1167        err = migrate_to_node(mm, source, dest, flags);
1168        if (err > 0)
1169            busy += err;
1170        if (err < 0)
1171            break;
1172    }
1173out:
1174    up_read(&mm->mmap_sem);
1175    if (err < 0)
1176        return err;
1177    return busy;
1178
1179}
1180
1181/*
1182 * Allocate a new page for page migration based on vma policy.
1183 * Start assuming that page is mapped by vma pointed to by @private.
1184 * Search forward from there, if not. N.B., this assumes that the
1185 * list of pages handed to migrate_pages()--which is how we get here--
1186 * is in virtual address order.
1187 */
1188static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1189{
1190    struct vm_area_struct *vma = (struct vm_area_struct *)private;
1191    unsigned long uninitialized_var(address);
1192
1193    while (vma) {
1194        address = page_address_in_vma(page, vma);
1195        if (address != -EFAULT)
1196            break;
1197        vma = vma->vm_next;
1198    }
1199    /*
1200     * queue_pages_range() confirms that @page belongs to some vma,
1201     * so vma shouldn't be NULL.
1202     */
1203    BUG_ON(!vma);
1204
1205    if (PageHuge(page))
1206        return alloc_huge_page_noerr(vma, address, 1);
1207    return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1208}
1209#else
1210
1211static void migrate_page_add(struct page *page, struct list_head *pagelist,
1212                unsigned long flags)
1213{
1214}
1215
1216int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1217             const nodemask_t *to, int flags)
1218{
1219    return -ENOSYS;
1220}
1221
1222static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1223{
1224    return NULL;
1225}
1226#endif
1227
1228static long do_mbind(unsigned long start, unsigned long len,
1229             unsigned short mode, unsigned short mode_flags,
1230             nodemask_t *nmask, unsigned long flags)
1231{
1232    struct vm_area_struct *vma;
1233    struct mm_struct *mm = current->mm;
1234    struct mempolicy *new;
1235    unsigned long end;
1236    int err;
1237    LIST_HEAD(pagelist);
1238
1239    if (flags & ~(unsigned long)MPOL_MF_VALID)
1240        return -EINVAL;
1241    if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1242        return -EPERM;
1243
1244    if (start & ~PAGE_MASK)
1245        return -EINVAL;
1246
1247    if (mode == MPOL_DEFAULT)
1248        flags &= ~MPOL_MF_STRICT;
1249
1250    len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1251    end = start + len;
1252
1253    if (end < start)
1254        return -EINVAL;
1255    if (end == start)
1256        return 0;
1257
1258    new = mpol_new(mode, mode_flags, nmask);
1259    if (IS_ERR(new))
1260        return PTR_ERR(new);
1261
1262    if (flags & MPOL_MF_LAZY)
1263        new->flags |= MPOL_F_MOF;
1264
1265    /*
1266     * If we are using the default policy then operation
1267     * on discontinuous address spaces is okay after all
1268     */
1269    if (!new)
1270        flags |= MPOL_MF_DISCONTIG_OK;
1271
1272    pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1273         start, start + len, mode, mode_flags,
1274         nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1275
1276    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1277
1278        err = migrate_prep();
1279        if (err)
1280            goto mpol_out;
1281    }
1282    {
1283        NODEMASK_SCRATCH(scratch);
1284        if (scratch) {
1285            down_write(&mm->mmap_sem);
1286            task_lock(current);
1287            err = mpol_set_nodemask(new, nmask, scratch);
1288            task_unlock(current);
1289            if (err)
1290                up_write(&mm->mmap_sem);
1291        } else
1292            err = -ENOMEM;
1293        NODEMASK_SCRATCH_FREE(scratch);
1294    }
1295    if (err)
1296        goto mpol_out;
1297
1298    vma = queue_pages_range(mm, start, end, nmask,
1299              flags | MPOL_MF_INVERT, &pagelist);
1300
1301    err = PTR_ERR(vma); /* maybe ... */
1302    if (!IS_ERR(vma))
1303        err = mbind_range(mm, start, end, new);
1304
1305    if (!err) {
1306        int nr_failed = 0;
1307
1308        if (!list_empty(&pagelist)) {
1309            WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1310            nr_failed = migrate_pages(&pagelist, new_vma_page,
1311                    (unsigned long)vma,
1312                    MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1313            if (nr_failed)
1314                putback_movable_pages(&pagelist);
1315        }
1316
1317        if (nr_failed && (flags & MPOL_MF_STRICT))
1318            err = -EIO;
1319    } else
1320        putback_lru_pages(&pagelist);
1321
1322    up_write(&mm->mmap_sem);
1323 mpol_out:
1324    mpol_put(new);
1325    return err;
1326}
1327
1328/*
1329 * User space interface with variable sized bitmaps for nodelists.
1330 */
1331
1332/* Copy a node mask from user space. */
1333static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1334             unsigned long maxnode)
1335{
1336    unsigned long k;
1337    unsigned long nlongs;
1338    unsigned long endmask;
1339
1340    --maxnode;
1341    nodes_clear(*nodes);
1342    if (maxnode == 0 || !nmask)
1343        return 0;
1344    if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1345        return -EINVAL;
1346
1347    nlongs = BITS_TO_LONGS(maxnode);
1348    if ((maxnode % BITS_PER_LONG) == 0)
1349        endmask = ~0UL;
1350    else
1351        endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1352
1353    /* When the user specified more nodes than supported just check
1354       that the unsupported part is all zero. */
1355    if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1356        if (nlongs > PAGE_SIZE/sizeof(long))
1357            return -EINVAL;
1358        for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1359            unsigned long t;
1360            if (get_user(t, nmask + k))
1361                return -EFAULT;
1362            if (k == nlongs - 1) {
1363                if (t & endmask)
1364                    return -EINVAL;
1365            } else if (t)
1366                return -EINVAL;
1367        }
1368        nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1369        endmask = ~0UL;
1370    }
1371
1372    if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1373        return -EFAULT;
1374    nodes_addr(*nodes)[nlongs-1] &= endmask;
1375    return 0;
1376}
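/*
 * Worked example (sketch): for a call with maxnode = 9, the --maxnode
 * adjustment leaves bits 0-7, so nlongs = 1 and
 * endmask = (1UL << 8) - 1 = 0xff; any higher bits in the copied word are
 * cleared by the final "&= endmask".
 */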
1377
1378/* Copy a kernel node mask to user space */
1379static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1380                  nodemask_t *nodes)
1381{
1382    unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1383    const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1384
1385    if (copy > nbytes) {
1386        if (copy > PAGE_SIZE)
1387            return -EINVAL;
1388        if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1389            return -EFAULT;
1390        copy = nbytes;
1391    }
1392    return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1393}
1394
1395SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1396        unsigned long, mode, unsigned long __user *, nmask,
1397        unsigned long, maxnode, unsigned, flags)
1398{
1399    nodemask_t nodes;
1400    int err;
1401    unsigned short mode_flags;
1402
1403    mode_flags = mode & MPOL_MODE_FLAGS;
1404    mode &= ~MPOL_MODE_FLAGS;
1405    if (mode >= MPOL_MAX)
1406        return -EINVAL;
1407    if ((mode_flags & MPOL_F_STATIC_NODES) &&
1408        (mode_flags & MPOL_F_RELATIVE_NODES))
1409        return -EINVAL;
1410    err = get_nodes(&nodes, nmask, maxnode);
1411    if (err)
1412        return err;
1413    return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1414}
1415
1416/* Set the process memory policy */
1417SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1418        unsigned long, maxnode)
1419{
1420    int err;
1421    nodemask_t nodes;
1422    unsigned short flags;
1423
1424    flags = mode & MPOL_MODE_FLAGS;
1425    mode &= ~MPOL_MODE_FLAGS;
1426    if ((unsigned int)mode >= MPOL_MAX)
1427        return -EINVAL;
1428    if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1429        return -EINVAL;
1430    err = get_nodes(&nodes, nmask, maxnode);
1431    if (err)
1432        return err;
1433    return do_set_mempolicy(mode, flags, &nodes);
1434}
1435
1436SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1437        const unsigned long __user *, old_nodes,
1438        const unsigned long __user *, new_nodes)
1439{
1440    const struct cred *cred = current_cred(), *tcred;
1441    struct mm_struct *mm = NULL;
1442    struct task_struct *task;
1443    nodemask_t task_nodes;
1444    int err;
1445    nodemask_t *old;
1446    nodemask_t *new;
1447    NODEMASK_SCRATCH(scratch);
1448
1449    if (!scratch)
1450        return -ENOMEM;
1451
1452    old = &scratch->mask1;
1453    new = &scratch->mask2;
1454
1455    err = get_nodes(old, old_nodes, maxnode);
1456    if (err)
1457        goto out;
1458
1459    err = get_nodes(new, new_nodes, maxnode);
1460    if (err)
1461        goto out;
1462
1463    /* Find the mm_struct */
1464    rcu_read_lock();
1465    task = pid ? find_task_by_vpid(pid) : current;
1466    if (!task) {
1467        rcu_read_unlock();
1468        err = -ESRCH;
1469        goto out;
1470    }
1471    get_task_struct(task);
1472
1473    err = -EINVAL;
1474
1475    /*
1476     * Check if this process has the right to modify the specified
1477     * process. The right exists if the process has administrative
1478     * capabilities, superuser privileges or the same
1479     * userid as the target process.
1480     */
1481    tcred = __task_cred(task);
1482    if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1483        !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1484        !capable(CAP_SYS_NICE)) {
1485        rcu_read_unlock();
1486        err = -EPERM;
1487        goto out_put;
1488    }
1489    rcu_read_unlock();
1490
1491    task_nodes = cpuset_mems_allowed(task);
1492    /* Is the user allowed to access the target nodes? */
1493    if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1494        err = -EPERM;
1495        goto out_put;
1496    }
1497
1498    if (!nodes_subset(*new, node_states[N_MEMORY])) {
1499        err = -EINVAL;
1500        goto out_put;
1501    }
1502
1503    err = security_task_movememory(task);
1504    if (err)
1505        goto out_put;
1506
1507    mm = get_task_mm(task);
1508    put_task_struct(task);
1509
1510    if (!mm) {
1511        err = -EINVAL;
1512        goto out;
1513    }
1514
1515    err = do_migrate_pages(mm, old, new,
1516        capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1517
1518    mmput(mm);
1519out:
1520    NODEMASK_SCRATCH_FREE(scratch);
1521
1522    return err;
1523
1524out_put:
1525    put_task_struct(task);
1526    goto out;
1527
1528}
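/*
 * Illustrative userspace sketch (not kernel code, using libnuma's
 * <numaif.h> wrapper; "pid" is an assumed target task the caller may
 * modify): move that task's pages from node 0 to node 1.
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long left = migrate_pages(pid, sizeof(from) * 8, &from, &to);
 *	// left: pages that could not be moved, or -1 with errno on failure
 */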
1529
1530
1531/* Retrieve NUMA policy */
1532SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1533        unsigned long __user *, nmask, unsigned long, maxnode,
1534        unsigned long, addr, unsigned long, flags)
1535{
1536    int err;
1537    int uninitialized_var(pval);
1538    nodemask_t nodes;
1539
1540    if (nmask != NULL && maxnode < MAX_NUMNODES)
1541        return -EINVAL;
1542
1543    err = do_get_mempolicy(&pval, &nodes, addr, flags);
1544
1545    if (err)
1546        return err;
1547
1548    if (policy && put_user(pval, policy))
1549        return -EFAULT;
1550
1551    if (nmask)
1552        err = copy_nodes_to_user(nmask, maxnode, &nodes);
1553
1554    return err;
1555}
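/*
 * Illustrative userspace sketch (not kernel code, using libnuma's
 * <numaif.h> wrapper; "addr" is an assumed mapped address): ask which node
 * currently backs a page.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int node = -1;
 *	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) == 0)
 *		printf("page at %p is on node %d\n", addr, node);
 */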
1556
1557#ifdef CONFIG_COMPAT
1558
1559asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1560                     compat_ulong_t __user *nmask,
1561                     compat_ulong_t maxnode,
1562                     compat_ulong_t addr, compat_ulong_t flags)
1563{
1564    long err;
1565    unsigned long __user *nm = NULL;
1566    unsigned long nr_bits, alloc_size;
1567    DECLARE_BITMAP(bm, MAX_NUMNODES);
1568
1569    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1570    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1571
1572    if (nmask)
1573        nm = compat_alloc_user_space(alloc_size);
1574
1575    err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1576
1577    if (!err && nmask) {
1578        unsigned long copy_size;
1579        copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1580        err = copy_from_user(bm, nm, copy_size);
1581        /* ensure entire bitmap is zeroed */
1582        err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1583        err |= compat_put_bitmap(nmask, bm, nr_bits);
1584    }
1585
1586    return err;
1587}
1588
1589asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1590                     compat_ulong_t maxnode)
1591{
1592    long err = 0;
1593    unsigned long __user *nm = NULL;
1594    unsigned long nr_bits, alloc_size;
1595    DECLARE_BITMAP(bm, MAX_NUMNODES);
1596
1597    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1598    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1599
1600    if (nmask) {
1601        err = compat_get_bitmap(bm, nmask, nr_bits);
1602        nm = compat_alloc_user_space(alloc_size);
1603        err |= copy_to_user(nm, bm, alloc_size);
1604    }
1605
1606    if (err)
1607        return -EFAULT;
1608
1609    return sys_set_mempolicy(mode, nm, nr_bits+1);
1610}
1611
1612asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1613                 compat_ulong_t mode, compat_ulong_t __user *nmask,
1614                 compat_ulong_t maxnode, compat_ulong_t flags)
1615{
1616    long err = 0;
1617    unsigned long __user *nm = NULL;
1618    unsigned long nr_bits, alloc_size;
1619    nodemask_t bm;
1620
1621    nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1622    alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1623
1624    if (nmask) {
1625        err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1626        nm = compat_alloc_user_space(alloc_size);
1627        err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1628    }
1629
1630    if (err)
1631        return -EFAULT;
1632
1633    return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1634}
1635
1636#endif
1637
1638/*
1639 * get_vma_policy(@task, @vma, @addr)
1640 * @task - task for fallback if vma policy == default
1641 * @vma - virtual memory area whose policy is sought
1642 * @addr - address in @vma for shared policy lookup
1643 *
1644 * Returns effective policy for a VMA at specified address.
1645 * Falls back to @task or system default policy, as necessary.
1646 * Current or other task's task mempolicy and non-shared vma policies must be
1647 * protected by task_lock(task) by the caller.
1648 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1649 * count--added by the get_policy() vm_op, as appropriate--to protect against
1650 * freeing by another task. It is the caller's responsibility to free the
1651 * extra reference for shared policies.
1652 */
1653struct mempolicy *get_vma_policy(struct task_struct *task,
1654        struct vm_area_struct *vma, unsigned long addr)
1655{
1656    struct mempolicy *pol = get_task_policy(task);
1657
1658    if (vma) {
1659        if (vma->vm_ops && vma->vm_ops->get_policy) {
1660            struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1661                                    addr);
1662            if (vpol)
1663                pol = vpol;
1664        } else if (vma->vm_policy) {
1665            pol = vma->vm_policy;
1666
1667            /*
1668             * shmem_alloc_page() passes MPOL_F_SHARED policy with
1669             * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1670             * count on these policies which will be dropped by
1671             * mpol_cond_put() later
1672             */
1673            if (mpol_needs_cond_ref(pol))
1674                mpol_get(pol);
1675        }
1676    }
1677    if (!pol)
1678        pol = &default_policy;
1679    return pol;
1680}
1681
1682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{
1684    enum zone_type dynamic_policy_zone = policy_zone;
1685
1686    BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1687
1688    /*
1689     * If policy->v.nodes has movable memory only, we apply the policy
1690     * only when gfp_zone(gfp) is ZONE_MOVABLE.
1691     *
1692     * policy->v.nodes is intersected with node_states[N_MEMORY], so if
1693     * the following test fails, it implies that policy->v.nodes contains
1694     * movable memory only.
1695     */
1696    if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1697        dynamic_policy_zone = ZONE_MOVABLE;
1698
1699    return zone >= dynamic_policy_zone;
1700}
1701
1702/*
1703 * Return a nodemask representing a mempolicy for filtering nodes for
1704 * page allocation
1705 */
1706static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1707{
1708    /* Lower zones don't get a nodemask applied for MPOL_BIND */
1709    if (unlikely(policy->mode == MPOL_BIND) &&
1710            apply_policy_zone(policy, gfp_zone(gfp)) &&
1711            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1712        return &policy->v.nodes;
1713
1714    return NULL;
1715}
1716
1717/* Return a zonelist indicated by gfp for node representing a mempolicy */
1718static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1719    int nd)
1720{
1721    switch (policy->mode) {
1722    case MPOL_PREFERRED:
1723        if (!(policy->flags & MPOL_F_LOCAL))
1724            nd = policy->v.preferred_node;
1725        break;
1726    case MPOL_BIND:
1727        /*
1728         * Normally, MPOL_BIND allocations are node-local within the
1729         * allowed nodemask. However, if __GFP_THISNODE is set and the
1730         * current node isn't part of the mask, we use the zonelist for
1731         * the first node in the mask instead.
1732         */
1733        if (unlikely(gfp & __GFP_THISNODE) &&
1734                unlikely(!node_isset(nd, policy->v.nodes)))
1735            nd = first_node(policy->v.nodes);
1736        break;
1737    default:
1738        BUG();
1739    }
1740    return node_zonelist(nd, gfp);
1741}
1742
1743/* Do dynamic interleaving for a process */
1744static unsigned interleave_nodes(struct mempolicy *policy)
1745{
1746    unsigned nid, next;
1747    struct task_struct *me = current;
1748
1749    nid = me->il_next;
1750    next = next_node(nid, policy->v.nodes);
1751    if (next >= MAX_NUMNODES)
1752        next = first_node(policy->v.nodes);
1753    if (next < MAX_NUMNODES)
1754        me->il_next = next;
1755    return nid;
1756}
1757
1758/*
1759 * Depending on the memory policy provide a node from which to allocate the
1760 * next slab entry.
1761 * @policy must be protected from freeing by the caller. If @policy is
1762 * the current task's mempolicy, this protection is implicit, as only the
1763 * task can change its policy. The system default policy requires no
1764 * such protection.
1765 */
1766unsigned slab_node(void)
1767{
1768    struct mempolicy *policy;
1769
1770    if (in_interrupt())
1771        return numa_node_id();
1772
1773    policy = current->mempolicy;
1774    if (!policy || policy->flags & MPOL_F_LOCAL)
1775        return numa_node_id();
1776
1777    switch (policy->mode) {
1778    case MPOL_PREFERRED:
1779        /*
1780         * handled MPOL_F_LOCAL above
1781         */
1782        return policy->v.preferred_node;
1783
1784    case MPOL_INTERLEAVE:
1785        return interleave_nodes(policy);
1786
1787    case MPOL_BIND: {
1788        /*
1789         * Follow bind policy behavior and start allocation at the
1790         * first node.
1791         */
1792        struct zonelist *zonelist;
1793        struct zone *zone;
1794        enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1795        zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1796        (void)first_zones_zonelist(zonelist, highest_zoneidx,
1797                            &policy->v.nodes,
1798                            &zone);
1799        return zone ? zone->node : numa_node_id();
1800    }
1801
1802    default:
1803        BUG();
1804    }
1805}
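
/*
 * Illustrative sketch (not part of the original source): a slab-style
 * consumer of slab_node().  kmalloc_node() is the regular slab API; the
 * wrapper name is hypothetical.
 */
static void *example_policy_aware_kmalloc(size_t size, gfp_t flags)
{
	/* let the current task's mempolicy choose the node for this object */
	return kmalloc_node(size, flags, slab_node());
}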
1806
1807/* Do static interleaving for a VMA with known offset. */
1808static unsigned offset_il_node(struct mempolicy *pol,
1809        struct vm_area_struct *vma, unsigned long off)
1810{
1811    unsigned nnodes = nodes_weight(pol->v.nodes);
1812    unsigned target;
1813    int c;
1814    int nid = -1;
1815
1816    if (!nnodes)
1817        return numa_node_id();
1818    target = (unsigned int)off % nnodes;
1819    c = 0;
1820    do {
1821        nid = next_node(nid, pol->v.nodes);
1822        c++;
1823    } while (c <= target);
1824    return nid;
1825}
1826
1827/* Determine a node number for interleave */
1828static inline unsigned interleave_nid(struct mempolicy *pol,
1829         struct vm_area_struct *vma, unsigned long addr, int shift)
1830{
1831    if (vma) {
1832        unsigned long off;
1833
1834        /*
1835         * for small pages, there is no difference between
1836         * shift and PAGE_SHIFT, so the bit-shift is safe.
1837         * for huge pages, since vm_pgoff is in units of small
1838         * pages, we need to shift off the always 0 bits to get
1839         * a useful offset.
1840         */
1841        BUG_ON(shift < PAGE_SHIFT);
1842        off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1843        off += (addr - vma->vm_start) >> shift;
1844        return offset_il_node(pol, vma, off);
1845    } else
1846        return interleave_nodes(pol);
1847}
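
/*
 * Worked example (illustrative, not part of the original source): with
 * 2MB huge pages on x86-64, shift == 21 and PAGE_SHIFT == 12.  A vma with
 * vm_pgoff == 512 (a 2MB offset expressed in 4KB units) faulting at
 * vma->vm_start + 4MB gives
 *
 *	off = (512 >> 9) + (0x400000 >> 21) = 1 + 2 = 3
 *
 * i.e. the object's fourth huge page, which offset_il_node() then reduces
 * modulo the number of nodes in the interleave set.  The helper below
 * merely restates that computation and is hypothetical.
 */
static unsigned long example_huge_interleave_index(struct vm_area_struct *vma,
						   unsigned long addr, int shift)
{
	return (vma->vm_pgoff >> (shift - PAGE_SHIFT)) +
	       ((addr - vma->vm_start) >> shift);
}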
1848
1849/*
1850 * Return the bit number of a random bit set in the nodemask.
1851 * (returns -1 if nodemask is empty)
1852 */
1853int node_random(const nodemask_t *maskp)
1854{
1855    int w, bit = -1;
1856
1857    w = nodes_weight(*maskp);
1858    if (w)
1859        bit = bitmap_ord_to_pos(maskp->bits,
1860            get_random_int() % w, MAX_NUMNODES);
1861    return bit;
1862}
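
/*
 * Illustrative sketch (not part of the original source): picking a random
 * node that actually has memory, falling back to the local node when the
 * mask is empty.  The helper name is hypothetical.
 */
static int example_random_memory_node(void)
{
	int nid = node_random(&node_states[N_MEMORY]);

	return nid >= 0 ? nid : numa_node_id();
}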
1863
1864#ifdef CONFIG_HUGETLBFS
1865/*
1866 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1867 * @vma = virtual memory area whose policy is sought
1868 * @addr = address in @vma for shared policy lookup and interleave policy
1869 * @gfp_flags = for requested zone
1870 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1871 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1872 *
1873 * Returns a zonelist suitable for a huge page allocation and a pointer
1874 * to the struct mempolicy for conditional unref after allocation.
1875 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1876 * @nodemask for filtering the zonelist.
1877 *
1878 * Must be protected by get_mems_allowed()
1879 */
1880struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1881                gfp_t gfp_flags, struct mempolicy **mpol,
1882                nodemask_t **nodemask)
1883{
1884    struct zonelist *zl;
1885
1886    *mpol = get_vma_policy(current, vma, addr);
1887    *nodemask = NULL; /* assume !MPOL_BIND */
1888
1889    if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1890        zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1891                huge_page_shift(hstate_vma(vma))), gfp_flags);
1892    } else {
1893        zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1894        if ((*mpol)->mode == MPOL_BIND)
1895            *nodemask = &(*mpol)->v.nodes;
1896    }
1897    return zl;
1898}
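
/*
 * Illustrative sketch (not part of the original source): how a hugetlb
 * style allocation path might consume huge_zonelist().  The cookie-based
 * get_mems_allowed()/put_mems_allowed() pair and the trailing
 * mpol_cond_put() follow the contract described above; the helper name,
 * the use of GFP_HIGHUSER_MOVABLE and the order-0 allocation are
 * assumptions made for the example.
 */
static struct page *example_alloc_policied_page(struct vm_area_struct *vma,
						unsigned long addr)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;
	struct page *page;
	unsigned int cookie;

	do {
		cookie = get_mems_allowed();
		zl = huge_zonelist(vma, addr, GFP_HIGHUSER_MOVABLE,
				   &mpol, &nodemask);
		page = __alloc_pages_nodemask(GFP_HIGHUSER_MOVABLE, 0,
					      zl, nodemask);
		mpol_cond_put(mpol);
	} while (!put_mems_allowed(cookie) && !page);

	return page;
}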
1899
1900/*
1901 * init_nodemask_of_mempolicy
1902 *
1903 * If the current task's mempolicy is "default" [NULL], return 'false'
1904 * to indicate default policy. Otherwise, extract the policy nodemask
1905 * for 'bind' or 'interleave' policy into the argument nodemask, or
1906 * initialize the argument nodemask to contain the single node for
1907 * 'preferred' or 'local' policy and return 'true' to indicate presence
1908 * of non-default mempolicy.
1909 *
1910 * We don't bother with reference counting the mempolicy [mpol_get/put]
1911 * because the current task is examining its own mempolicy and a task's
1912 * mempolicy is only ever changed by the task itself.
1913 *
1914 * N.B., it is the caller's responsibility to free a returned nodemask.
1915 */
1916bool init_nodemask_of_mempolicy(nodemask_t *mask)
1917{
1918    struct mempolicy *mempolicy;
1919    int nid;
1920
1921    if (!(mask && current->mempolicy))
1922        return false;
1923
1924    task_lock(current);
1925    mempolicy = current->mempolicy;
1926    switch (mempolicy->mode) {
1927    case MPOL_PREFERRED:
1928        if (mempolicy->flags & MPOL_F_LOCAL)
1929            nid = numa_node_id();
1930        else
1931            nid = mempolicy->v.preferred_node;
1932        init_nodemask_of_node(mask, nid);
1933        break;
1934
1935    case MPOL_BIND:
1936        /* Fall through */
1937    case MPOL_INTERLEAVE:
1938        *mask = mempolicy->v.nodes;
1939        break;
1940
1941    default:
1942        BUG();
1943    }
1944    task_unlock(current);
1945
1946    return true;
1947}
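
/*
 * Illustrative sketch (not part of the original source): walking only the
 * nodes the current task's mempolicy allows, the way the hugetlb sysfs
 * code restricts per-node pool adjustments.  NODEMASK_ALLOC()/
 * NODEMASK_FREE() and for_each_node_mask() come from <linux/nodemask.h>;
 * the function name is hypothetical.
 */
static void example_walk_allowed_nodes(void)
{
	int nid;
	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL);

	if (nodes_allowed && init_nodemask_of_mempolicy(nodes_allowed)) {
		for_each_node_mask(nid, *nodes_allowed)
			pr_debug("policy-allowed node %d\n", nid);
	} else {
		for_each_node_state(nid, N_MEMORY)
			pr_debug("default policy, node %d\n", nid);
	}
	NODEMASK_FREE(nodes_allowed);
}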
1948#endif
1949
1950/*
1951 * mempolicy_nodemask_intersects
1952 *
1953 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1954 * policy. Otherwise, check for intersection between mask and the policy
1955 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1956 * policy, always return true since it may allocate elsewhere on fallback.
1957 *
1958 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1959 */
1960bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1961                    const nodemask_t *mask)
1962{
1963    struct mempolicy *mempolicy;
1964    bool ret = true;
1965
1966    if (!mask)
1967        return ret;
1968    task_lock(tsk);
1969    mempolicy = tsk->mempolicy;
1970    if (!mempolicy)
1971        goto out;
1972
1973    switch (mempolicy->mode) {
1974    case MPOL_PREFERRED:
1975        /*
1976         * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred nodes to
1977         * allocate from; the task may fall back to other nodes when OOM.
1978         * Thus, it's possible for tsk to have allocated memory from
1979         * nodes in mask.
1980         */
1981        break;
1982    case MPOL_BIND:
1983    case MPOL_INTERLEAVE:
1984        ret = nodes_intersects(mempolicy->v.nodes, *mask);
1985        break;
1986    default:
1987        BUG();
1988    }
1989out:
1990    task_unlock(tsk);
1991    return ret;
1992}
1993
1994/* Allocate a page in interleaved policy.
1995   Own path because it needs to do special accounting. */
1996static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1997                    unsigned nid)
1998{
1999    struct zonelist *zl;
2000    struct page *page;
2001
2002    zl = node_zonelist(nid, gfp);
2003    page = __alloc_pages(gfp, order, zl);
2004    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
2005        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
2006    return page;
2007}
2008
2009/**
2010 * alloc_pages_vma - Allocate a page for a VMA.
2011 *
2012 * @gfp:
2013 * %GFP_USER user allocation.
2014 * %GFP_KERNEL kernel allocations,
2015 * %GFP_HIGHMEM highmem/user allocations,
2016 * %GFP_FS allocation should not call back into a file system.
2017 * %GFP_ATOMIC don't sleep.
2018 *
2019 * @order: Order of the GFP allocation.
2020 * @vma: Pointer to VMA or NULL if not available.
2021 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2022 *
2023 * This function allocates a page from the kernel page pool and applies
2024 * a NUMA policy associated with the VMA or the current process.
2025 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2026 * mm_struct of the VMA to prevent it from going away. Should be used for
2027 * all allocations for pages that will be mapped into
2028 * user space. Returns NULL when no page can be allocated.
2029 *
2030 * Should be called with the mmap_sem of the vma held.
2031 */
2032struct page *
2033alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2034        unsigned long addr, int node)
2035{
2036    struct mempolicy *pol;
2037    struct page *page;
2038    unsigned int cpuset_mems_cookie;
2039
2040retry_cpuset:
2041    pol = get_vma_policy(current, vma, addr);
2042    cpuset_mems_cookie = get_mems_allowed();
2043
2044    if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2045        unsigned nid;
2046
2047        nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2048        mpol_cond_put(pol);
2049        page = alloc_page_interleave(gfp, order, nid);
2050        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2051            goto retry_cpuset;
2052
2053        return page;
2054    }
2055    page = __alloc_pages_nodemask(gfp, order,
2056                      policy_zonelist(gfp, pol, node),
2057                      policy_nodemask(gfp, pol));
2058    if (unlikely(mpol_needs_cond_ref(pol)))
2059        __mpol_put(pol);
2060    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2061        goto retry_cpuset;
2062    return page;
2063}
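
/*
 * Illustrative sketch (not part of the original source): a fault-path
 * style caller of alloc_pages_vma().  The caller is assumed to already
 * hold down_read() on the mmap_sem as required above; the wrapper name
 * and the GFP_HIGHUSER_MOVABLE/order-0 choice are assumptions.
 */
static struct page *example_alloc_user_page(struct vm_area_struct *vma,
					    unsigned long addr)
{
	/*
	 * Order-0 page, placed by the VMA (or task) policy, with the
	 * local node as the starting point for non-interleaved policies.
	 */
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id());
}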
2064
2065/**
2066 * alloc_pages_current - Allocate pages.
2067 *
2068 * @gfp:
2069 * %GFP_USER user allocation,
2070 * %GFP_KERNEL kernel allocation,
2071 * %GFP_HIGHMEM highmem allocation,
2072 * %GFP_FS don't call back into a file system.
2073 * %GFP_ATOMIC don't sleep.
2074 * @order: Power of two of allocation size in pages. 0 is a single page.
2075 *
2076 * Allocate a page from the kernel page pool. When not in interrupt
2077 * context, the current process' NUMA policy is applied.
2078 * Returns NULL when no page can be allocated.
2079 *
2080 * Don't call cpuset_update_task_memory_state() unless
2081 * 1) it's ok to take cpuset_sem (can WAIT), and
2082 * 2) allocating for current task (not interrupt).
2083 */
2084struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2085{
2086    struct mempolicy *pol = get_task_policy(current);
2087    struct page *page;
2088    unsigned int cpuset_mems_cookie;
2089
2090    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2091        pol = &default_policy;
2092
2093retry_cpuset:
2094    cpuset_mems_cookie = get_mems_allowed();
2095
2096    /*
2097     * No reference counting needed for current->mempolicy
2098     * nor system default_policy
2099     */
2100    if (pol->mode == MPOL_INTERLEAVE)
2101        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2102    else
2103        page = __alloc_pages_nodemask(gfp, order,
2104                policy_zonelist(gfp, pol, numa_node_id()),
2105                policy_nodemask(gfp, pol));
2106
2107    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2108        goto retry_cpuset;
2109
2110    return page;
2111}
2112EXPORT_SYMBOL(alloc_pages_current);
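
/*
 * Illustrative note (not part of the original source): on CONFIG_NUMA
 * kernels the generic alloc_pages() wrapper resolves to
 * alloc_pages_current(), so an ordinary allocation like the hypothetical
 * helper below is already placed according to the caller's mempolicy.
 */
static struct page *example_alloc_kernel_pages(unsigned int order)
{
	return alloc_pages(GFP_KERNEL, order);
}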
2113
2114int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115{
2116    struct mempolicy *pol = mpol_dup(vma_policy(src));
2117
2118    if (IS_ERR(pol))
2119        return PTR_ERR(pol);
2120    dst->vm_policy = pol;
2121    return 0;
2122}
2123
2124/*
2125 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2126 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2127 * with the mems_allowed returned by cpuset_mems_allowed(). This
2128 * keeps mempolicies cpuset relative after its cpuset moves. See
2129 * further kernel/cpuset.c update_nodemask().
2130 *
2131 * current's mempolicy may be rebound by another task (the task that changes
2132 * the cpuset's mems), so we needn't do the rebind work for the current task.
2133 */
2134
2135/* Slow path of a mempolicy duplicate */
2136struct mempolicy *__mpol_dup(struct mempolicy *old)
2137{
2138    struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2139
2140    if (!new)
2141        return ERR_PTR(-ENOMEM);
2142
2143    /* task's mempolicy is protected by alloc_lock */
2144    if (old == current->mempolicy) {
2145        task_lock(current);
2146        *new = *old;
2147        task_unlock(current);
2148    } else
2149        *new = *old;
2150
2151    rcu_read_lock();
2152    if (current_cpuset_is_being_rebound()) {
2153        nodemask_t mems = cpuset_mems_allowed(current);
2154        if (new->flags & MPOL_F_REBINDING)
2155            mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2156        else
2157            mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2158    }
2159    rcu_read_unlock();
2160    atomic_set(&new->refcnt, 1);
2161    return new;
2162}
2163
2164/* Slow path of a mempolicy comparison */
2165bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2166{
2167    if (!a || !b)
2168        return false;
2169    if (a->mode != b->mode)
2170        return false;
2171    if (a->flags != b->flags)
2172        return false;
2173    if (mpol_store_user_nodemask(a))
2174        if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2175            return false;
2176
2177    switch (a->mode) {
2178    case MPOL_BIND:
2179        /* Fall through */
2180    case MPOL_INTERLEAVE:
2181        return !!nodes_equal(a->v.nodes, b->v.nodes);
2182    case MPOL_PREFERRED:
2183        return a->v.preferred_node == b->v.preferred_node;
2184    default:
2185        BUG();
2186        return false;
2187    }
2188}
2189
2190/*
2191 * Shared memory backing store policy support.
2192 *
2193 * Remember policies even when nobody has shared memory mapped.
2194 * The policies are kept in Red-Black tree linked from the inode.
2195 * They are protected by the sp->lock spinlock, which should be held
2196 * for any accesses to the tree.
2197 */
2198
2199/* lookup first element intersecting start-end */
2200/* Caller holds sp->lock */
2201static struct sp_node *
2202sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2203{
2204    struct rb_node *n = sp->root.rb_node;
2205
2206    while (n) {
2207        struct sp_node *p = rb_entry(n, struct sp_node, nd);
2208
2209        if (start >= p->end)
2210            n = n->rb_right;
2211        else if (end <= p->start)
2212            n = n->rb_left;
2213        else
2214            break;
2215    }
2216    if (!n)
2217        return NULL;
2218    for (;;) {
2219        struct sp_node *w = NULL;
2220        struct rb_node *prev = rb_prev(n);
2221        if (!prev)
2222            break;
2223        w = rb_entry(prev, struct sp_node, nd);
2224        if (w->end <= start)
2225            break;
2226        n = prev;
2227    }
2228    return rb_entry(n, struct sp_node, nd);
2229}
2230
2231/* Insert a new shared policy into the list. */
2232/* Caller holds sp->lock */
2233static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2234{
2235    struct rb_node **p = &sp->root.rb_node;
2236    struct rb_node *parent = NULL;
2237    struct sp_node *nd;
2238
2239    while (*p) {
2240        parent = *p;
2241        nd = rb_entry(parent, struct sp_node, nd);
2242        if (new->start < nd->start)
2243            p = &(*p)->rb_left;
2244        else if (new->end > nd->end)
2245            p = &(*p)->rb_right;
2246        else
2247            BUG();
2248    }
2249    rb_link_node(&new->nd, parent, p);
2250    rb_insert_color(&new->nd, &sp->root);
2251    pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2252         new->policy ? new->policy->mode : 0);
2253}
2254
2255/* Find shared policy intersecting idx */
2256struct mempolicy *
2257mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2258{
2259    struct mempolicy *pol = NULL;
2260    struct sp_node *sn;
2261
2262    if (!sp->root.rb_node)
2263        return NULL;
2264    spin_lock(&sp->lock);
2265    sn = sp_lookup(sp, idx, idx+1);
2266    if (sn) {
2267        mpol_get(sn->policy);
2268        pol = sn->policy;
2269    }
2270    spin_unlock(&sp->lock);
2271    return pol;
2272}
2273
2274static void sp_free(struct sp_node *n)
2275{
2276    mpol_put(n->policy);
2277    kmem_cache_free(sn_cache, n);
2278}
2279
2280/**
2281 * mpol_misplaced - check whether current page node is valid in policy
2282 *
2283 * @page - page to be checked
2284 * @vma - vm area where page mapped
2285 * @addr - virtual address where page mapped
2286 *
2287 * Lookup current policy node id for vma,addr and "compare to" page's
2288 * node id.
2289 *
2290 * Returns:
2291 * -1 - not misplaced, page is in the right node
2292 * node - node id where the page should be
2293 *
2294 * Policy determination "mimics" alloc_page_vma().
2295 * Called from fault path where we know the vma and faulting address.
2296 */
2297int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2298{
2299    struct mempolicy *pol;
2300    struct zone *zone;
2301    int curnid = page_to_nid(page);
2302    unsigned long pgoff;
2303    int polnid = -1;
2304    int ret = -1;
2305
2306    BUG_ON(!vma);
2307
2308    pol = get_vma_policy(current, vma, addr);
2309    if (!(pol->flags & MPOL_F_MOF))
2310        goto out;
2311
2312    switch (pol->mode) {
2313    case MPOL_INTERLEAVE:
2314        BUG_ON(addr >= vma->vm_end);
2315        BUG_ON(addr < vma->vm_start);
2316
2317        pgoff = vma->vm_pgoff;
2318        pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2319        polnid = offset_il_node(pol, vma, pgoff);
2320        break;
2321
2322    case MPOL_PREFERRED:
2323        if (pol->flags & MPOL_F_LOCAL)
2324            polnid = numa_node_id();
2325        else
2326            polnid = pol->v.preferred_node;
2327        break;
2328
2329    case MPOL_BIND:
2330        /*
2331         * allows binding to multiple nodes.
2332         * use current page if in policy nodemask,
2333         * else select nearest allowed node, if any.
2334         * If no allowed nodes, use current [!misplaced].
2335         */
2336        if (node_isset(curnid, pol->v.nodes))
2337            goto out;
2338        (void)first_zones_zonelist(
2339                node_zonelist(numa_node_id(), GFP_HIGHUSER),
2340                gfp_zone(GFP_HIGHUSER),
2341                &pol->v.nodes, &zone);
2342        polnid = zone->node;
2343        break;
2344
2345    default:
2346        BUG();
2347    }
2348
2349    /* Migrate the page towards the node whose CPU is referencing it */
2350    if (pol->flags & MPOL_F_MORON) {
2351        int last_nid;
2352
2353        polnid = numa_node_id();
2354
2355        /*
2356         * Multi-stage node selection is used in conjunction
2357         * with a periodic migration fault to build a temporal
2358         * task<->page relation. By using a two-stage filter we
2359         * remove short/unlikely relations.
2360         *
2361         * Using P(p) ~ n_p / n_t as per frequentist
2362         * probability, we can equate a task's usage of a
2363         * particular page (n_p) per total usage of this
2364         * page (n_t) (in a given time-span) to a probability.
2365         *
2366         * Our periodic faults will sample this probability and
2367         * getting the same result twice in a row, given these
2368         * samples are fully independent, is then given by
2369         * P(n)^2, provided our sample period is sufficiently
2370         * short compared to the usage pattern.
2371         *
2372         * This quadratic squishes small probabilities, making
2373         * it less likely we act on an unlikely task<->page
2374         * relation.
2375         */
2376        last_nid = page_nid_xchg_last(page, polnid);
2377        if (last_nid != polnid)
2378            goto out;
2379    }
2380
2381    if (curnid != polnid)
2382        ret = polnid;
2383out:
2384    mpol_cond_put(pol);
2385
2386    return ret;
2387}
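
/*
 * Illustrative sketch (not part of the original source): how a NUMA
 * hinting fault handler interprets mpol_misplaced().  A return value of
 * -1 means the page already sits on an acceptable node; anything else is
 * the node the page should be migrated towards.  The helper name is
 * hypothetical.
 */
static bool example_page_is_misplaced(struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long addr, int *target_nid)
{
	*target_nid = mpol_misplaced(page, vma, addr);

	return *target_nid != -1;	/* caller would migrate to *target_nid */
}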
2388
2389static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2390{
2391    pr_debug("deleting %lx-%lx\n", n->start, n->end);
2392    rb_erase(&n->nd, &sp->root);
2393    sp_free(n);
2394}
2395
2396static void sp_node_init(struct sp_node *node, unsigned long start,
2397            unsigned long end, struct mempolicy *pol)
2398{
2399    node->start = start;
2400    node->end = end;
2401    node->policy = pol;
2402}
2403
2404static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2405                struct mempolicy *pol)
2406{
2407    struct sp_node *n;
2408    struct mempolicy *newpol;
2409
2410    n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2411    if (!n)
2412        return NULL;
2413
2414    newpol = mpol_dup(pol);
2415    if (IS_ERR(newpol)) {
2416        kmem_cache_free(sn_cache, n);
2417        return NULL;
2418    }
2419    newpol->flags |= MPOL_F_SHARED;
2420    sp_node_init(n, start, end, newpol);
2421
2422    return n;
2423}
2424
2425/* Replace a policy range. */
2426static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2427                 unsigned long end, struct sp_node *new)
2428{
2429    struct sp_node *n;
2430    struct sp_node *n_new = NULL;
2431    struct mempolicy *mpol_new = NULL;
2432    int ret = 0;
2433
2434restart:
2435    spin_lock(&sp->lock);
2436    n = sp_lookup(sp, start, end);
2437    /* Take care of old policies in the same range. */
2438    while (n && n->start < end) {
2439        struct rb_node *next = rb_next(&n->nd);
2440        if (n->start >= start) {
2441            if (n->end <= end)
2442                sp_delete(sp, n);
2443            else
2444                n->start = end;
2445        } else {
2446            /* Old policy spanning whole new range. */
2447            if (n->end > end) {
2448                if (!n_new)
2449                    goto alloc_new;
2450
2451                *mpol_new = *n->policy;
2452                atomic_set(&mpol_new->refcnt, 1);
2453                sp_node_init(n_new, end, n->end, mpol_new);
2454                n->end = start;
2455                sp_insert(sp, n_new);
2456                n_new = NULL;
2457                mpol_new = NULL;
2458                break;
2459            } else
2460                n->end = start;
2461        }
2462        if (!next)
2463            break;
2464        n = rb_entry(next, struct sp_node, nd);
2465    }
2466    if (new)
2467        sp_insert(sp, new);
2468    spin_unlock(&sp->lock);
2469    ret = 0;
2470
2471err_out:
2472    if (mpol_new)
2473        mpol_put(mpol_new);
2474    if (n_new)
2475        kmem_cache_free(sn_cache, n_new);
2476
2477    return ret;
2478
2479alloc_new:
2480    spin_unlock(&sp->lock);
2481    ret = -ENOMEM;
2482    n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2483    if (!n_new)
2484        goto err_out;
2485    mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2486    if (!mpol_new)
2487        goto err_out;
2488    goto restart;
2489}
2490
2491/**
2492 * mpol_shared_policy_init - initialize shared policy for inode
2493 * @sp: pointer to inode shared policy
2494 * @mpol: struct mempolicy to install
2495 *
2496 * Install non-NULL @mpol in inode's shared policy rb-tree.
2497 * On entry, the current task has a reference on a non-NULL @mpol.
2498 * This must be released on exit.
2499 * This is called at get_inode() time, so we can use GFP_KERNEL.
2500 */
2501void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2502{
2503    int ret;
2504
2505    sp->root = RB_ROOT; /* empty tree == default mempolicy */
2506    spin_lock_init(&sp->lock);
2507
2508    if (mpol) {
2509        struct vm_area_struct pvma;
2510        struct mempolicy *new;
2511        NODEMASK_SCRATCH(scratch);
2512
2513        if (!scratch)
2514            goto put_mpol;
2515        /* contextualize the tmpfs mount point mempolicy */
2516        new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2517        if (IS_ERR(new))
2518            goto free_scratch; /* no valid nodemask intersection */
2519
2520        task_lock(current);
2521        ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2522        task_unlock(current);
2523        if (ret)
2524            goto put_new;
2525
2526        /* Create pseudo-vma that contains just the policy */
2527        memset(&pvma, 0, sizeof(struct vm_area_struct));
2528        pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2529        mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2530
2531put_new:
2532        mpol_put(new); /* drop initial ref */
2533free_scratch:
2534        NODEMASK_SCRATCH_FREE(scratch);
2535put_mpol:
2536        mpol_put(mpol); /* drop our incoming ref on sb mpol */
2537    }
2538}
2539
2540int mpol_set_shared_policy(struct shared_policy *info,
2541            struct vm_area_struct *vma, struct mempolicy *npol)
2542{
2543    int err;
2544    struct sp_node *new = NULL;
2545    unsigned long sz = vma_pages(vma);
2546
2547    pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2548         vma->vm_pgoff,
2549         sz, npol ? npol->mode : -1,
2550         npol ? npol->flags : -1,
2551         npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2552
2553    if (npol) {
2554        new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2555        if (!new)
2556            return -ENOMEM;
2557    }
2558    err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2559    if (err && new)
2560        sp_free(new);
2561    return err;
2562}
2563
2564/* Free a backing policy store on inode delete. */
2565void mpol_free_shared_policy(struct shared_policy *p)
2566{
2567    struct sp_node *n;
2568    struct rb_node *next;
2569
2570    if (!p->root.rb_node)
2571        return;
2572    spin_lock(&p->lock);
2573    next = rb_first(&p->root);
2574    while (next) {
2575        n = rb_entry(next, struct sp_node, nd);
2576        next = rb_next(&n->nd);
2577        sp_delete(p, n);
2578    }
2579    spin_unlock(&p->lock);
2580}
2581
2582#ifdef CONFIG_NUMA_BALANCING
2583static bool __initdata numabalancing_override;
2584
2585static void __init check_numabalancing_enable(void)
2586{
2587    bool numabalancing_default = false;
2588
2589    if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2590        numabalancing_default = true;
2591
2592    if (nr_node_ids > 1 && !numabalancing_override) {
2593        printk(KERN_INFO "Enabling automatic NUMA balancing. "
2594            "Configure with numa_balancing= or the kernel.numa_balancing sysctl\n");
2595        set_numabalancing_state(numabalancing_default);
2596    }
2597}
2598
2599static int __init setup_numabalancing(char *str)
2600{
2601    int ret = 0;
2602    if (!str)
2603        goto out;
2604    numabalancing_override = true;
2605
2606    if (!strcmp(str, "enable")) {
2607        set_numabalancing_state(true);
2608        ret = 1;
2609    } else if (!strcmp(str, "disable")) {
2610        set_numabalancing_state(false);
2611        ret = 1;
2612    }
2613out:
2614    if (!ret)
2615        printk(KERN_WARNING "Unable to parse numa_balancing=\n");
2616
2617    return ret;
2618}
2619__setup("numa_balancing=", setup_numabalancing);
2620#else
2621static inline void __init check_numabalancing_enable(void)
2622{
2623}
2624#endif /* CONFIG_NUMA_BALANCING */
2625
2626/* assumes fs == KERNEL_DS */
2627void __init numa_policy_init(void)
2628{
2629    nodemask_t interleave_nodes;
2630    unsigned long largest = 0;
2631    int nid, prefer = 0;
2632
2633    policy_cache = kmem_cache_create("numa_policy",
2634                     sizeof(struct mempolicy),
2635                     0, SLAB_PANIC, NULL);
2636
2637    sn_cache = kmem_cache_create("shared_policy_node",
2638                     sizeof(struct sp_node),
2639                     0, SLAB_PANIC, NULL);
2640
2641    for_each_node(nid) {
2642        preferred_node_policy[nid] = (struct mempolicy) {
2643            .refcnt = ATOMIC_INIT(1),
2644            .mode = MPOL_PREFERRED,
2645            .flags = MPOL_F_MOF | MPOL_F_MORON,
2646            .v = { .preferred_node = nid, },
2647        };
2648    }
2649
2650    /*
2651     * Set interleaving policy for system init. Interleaving is only
2652     * enabled across suitably sized nodes (default is >= 16MB); if all
2653     * nodes are smaller, fall back to the largest node.
2654     */
2655    nodes_clear(interleave_nodes);
2656    for_each_node_state(nid, N_MEMORY) {
2657        unsigned long total_pages = node_present_pages(nid);
2658
2659        /* Preserve the largest node */
2660        if (largest < total_pages) {
2661            largest = total_pages;
2662            prefer = nid;
2663        }
2664
2665        /* Interleave this node? */
2666        if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2667            node_set(nid, interleave_nodes);
2668    }
2669
2670    /* All too small, use the largest */
2671    if (unlikely(nodes_empty(interleave_nodes)))
2672        node_set(prefer, interleave_nodes);
2673
2674    if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2675        printk("numa_policy_init: interleaving failed\n");
2676
2677    check_numabalancing_enable();
2678}
2679
2680/* Reset policy of current process to default */
2681void numa_default_policy(void)
2682{
2683    do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2684}
2685
2686/*
2687 * Parse and format mempolicy from/to strings
2688 */
2689
2690/*
2691 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2692 */
2693static const char * const policy_modes[] =
2694{
2695    [MPOL_DEFAULT] = "default",
2696    [MPOL_PREFERRED] = "prefer",
2697    [MPOL_BIND] = "bind",
2698    [MPOL_INTERLEAVE] = "interleave",
2699    [MPOL_LOCAL] = "local",
2700};
2701
2702
2703#ifdef CONFIG_TMPFS
2704/**
2705 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2706 * @str: string containing mempolicy to parse
2707 * @mpol: pointer to struct mempolicy pointer, returned on success.
2708 *
2709 * Format of input:
2710 * <mode>[=<flags>][:<nodelist>]
2711 *
2712 * On success, returns 0, else 1
2713 */
2714int mpol_parse_str(char *str, struct mempolicy **mpol)
2715{
2716    struct mempolicy *new = NULL;
2717    unsigned short mode;
2718    unsigned short mode_flags;
2719    nodemask_t nodes;
2720    char *nodelist = strchr(str, ':');
2721    char *flags = strchr(str, '=');
2722    int err = 1;
2723
2724    if (nodelist) {
2725        /* NUL-terminate mode or flags string */
2726        *nodelist++ = '\0';
2727        if (nodelist_parse(nodelist, nodes))
2728            goto out;
2729        if (!nodes_subset(nodes, node_states[N_MEMORY]))
2730            goto out;
2731    } else
2732        nodes_clear(nodes);
2733
2734    if (flags)
2735        *flags++ = '\0'; /* terminate mode string */
2736
2737    for (mode = 0; mode < MPOL_MAX; mode++) {
2738        if (!strcmp(str, policy_modes[mode])) {
2739            break;
2740        }
2741    }
2742    if (mode >= MPOL_MAX)
2743        goto out;
2744
2745    switch (mode) {
2746    case MPOL_PREFERRED:
2747        /*
2748         * Insist on a nodelist of one node only
2749         */
2750        if (nodelist) {
2751            char *rest = nodelist;
2752            while (isdigit(*rest))
2753                rest++;
2754            if (*rest)
2755                goto out;
2756        }
2757        break;
2758    case MPOL_INTERLEAVE:
2759        /*
2760         * Default to online nodes with memory if no nodelist
2761         */
2762        if (!nodelist)
2763            nodes = node_states[N_MEMORY];
2764        break;
2765    case MPOL_LOCAL:
2766        /*
2767         * Don't allow a nodelist; mpol_new() checks flags
2768         */
2769        if (nodelist)
2770            goto out;
2771        mode = MPOL_PREFERRED;
2772        break;
2773    case MPOL_DEFAULT:
2774        /*
2775         * Insist on an empty nodelist
2776         */
2777        if (!nodelist)
2778            err = 0;
2779        goto out;
2780    case MPOL_BIND:
2781        /*
2782         * Insist on a nodelist
2783         */
2784        if (!nodelist)
2785            goto out;
2786    }
2787
2788    mode_flags = 0;
2789    if (flags) {
2790        /*
2791         * Currently, we only support two mutually exclusive
2792         * mode flags.
2793         */
2794        if (!strcmp(flags, "static"))
2795            mode_flags |= MPOL_F_STATIC_NODES;
2796        else if (!strcmp(flags, "relative"))
2797            mode_flags |= MPOL_F_RELATIVE_NODES;
2798        else
2799            goto out;
2800    }
2801
2802    new = mpol_new(mode, mode_flags, &nodes);
2803    if (IS_ERR(new))
2804        goto out;
2805
2806    /*
2807     * Save nodes for mpol_to_str() to show the tmpfs mount options
2808     * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2809     */
2810    if (mode != MPOL_PREFERRED)
2811        new->v.nodes = nodes;
2812    else if (nodelist)
2813        new->v.preferred_node = first_node(nodes);
2814    else
2815        new->flags |= MPOL_F_LOCAL;
2816
2817    /*
2818     * Save nodes for contextualization: this will be used to "clone"
2819     * the mempolicy in a specific context [cpuset] at a later time.
2820     */
2821    new->w.user_nodemask = nodes;
2822
2823    err = 0;
2824
2825out:
2826    /* Restore string for error message */
2827    if (nodelist)
2828        *--nodelist = ':';
2829    if (flags)
2830        *--flags = '=';
2831    if (!err)
2832        *mpol = new;
2833    return err;
2834}
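
/*
 * Illustrative sketch (not part of the original source): parsing a tmpfs
 * "mpol=" mount option such as "interleave=static:0-3".  The buffer must
 * be writable because the parser temporarily NUL-terminates the mode and
 * flags substrings, and the nodelist must name nodes that actually have
 * memory.  The helper name and the example string are assumptions.
 */
static struct mempolicy *example_parse_mount_policy(void)
{
	char str[] = "interleave=static:0-3";
	struct mempolicy *mpol = NULL;

	if (mpol_parse_str(str, &mpol))
		return NULL;	/* parse error, *mpol left untouched */
	return mpol;		/* holds a reference; mpol_put() when done */
}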
2835#endif /* CONFIG_TMPFS */
2836
2837/**
2838 * mpol_to_str - format a mempolicy structure for printing
2839 * @buffer: to contain formatted mempolicy string
2840 * @maxlen: length of @buffer
2841 * @pol: pointer to mempolicy to be formatted
2842 *
2843 * Convert a mempolicy into a string.
2844 * Returns the number of characters in buffer (if positive)
2845 * or an error (negative)
2846 */
2847int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2848{
2849    char *p = buffer;
2850    int l;
2851    nodemask_t nodes;
2852    unsigned short mode;
2853    unsigned short flags = pol ? pol->flags : 0;
2854
2855    /*
2856     * Sanity check: room for longest mode, flag and some nodes
2857     */
2858    VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2859
2860    if (!pol || pol == &default_policy)
2861        mode = MPOL_DEFAULT;
2862    else
2863        mode = pol->mode;
2864
2865    switch (mode) {
2866    case MPOL_DEFAULT:
2867        nodes_clear(nodes);
2868        break;
2869
2870    case MPOL_PREFERRED:
2871        nodes_clear(nodes);
2872        if (flags & MPOL_F_LOCAL)
2873            mode = MPOL_LOCAL;
2874        else
2875            node_set(pol->v.preferred_node, nodes);
2876        break;
2877
2878    case MPOL_BIND:
2879        /* Fall through */
2880    case MPOL_INTERLEAVE:
2881        nodes = pol->v.nodes;
2882        break;
2883
2884    default:
2885        return -EINVAL;
2886    }
2887
2888    l = strlen(policy_modes[mode]);
2889    if (buffer + maxlen < p + l + 1)
2890        return -ENOSPC;
2891
2892    strcpy(p, policy_modes[mode]);
2893    p += l;
2894
2895    if (flags & MPOL_MODE_FLAGS) {
2896        if (buffer + maxlen < p + 2)
2897            return -ENOSPC;
2898        *p++ = '=';
2899
2900        /*
2901         * Currently, the only defined flags are mutually exclusive
2902         */
2903        if (flags & MPOL_F_STATIC_NODES)
2904            p += snprintf(p, buffer + maxlen - p, "static");
2905        else if (flags & MPOL_F_RELATIVE_NODES)
2906            p += snprintf(p, buffer + maxlen - p, "relative");
2907    }
2908
2909    if (!nodes_empty(nodes)) {
2910        if (buffer + maxlen < p + 2)
2911            return -ENOSPC;
2912        *p++ = ':';
2913        p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2914    }
2915    return p - buffer;
2916}
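
/*
 * Illustrative sketch (not part of the original source): formatting a
 * policy the way the tmpfs show_options or /proc/<pid>/numa_maps code
 * would.  The 64-byte buffer comfortably satisfies the size check above;
 * the helper name is hypothetical.
 */
static void example_show_policy(struct seq_file *m, struct mempolicy *pol)
{
	char buf[64];

	if (mpol_to_str(buf, sizeof(buf), pol) >= 0)
		seq_printf(m, "mpol=%s\n", buf);
}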
2917
