/*
 * mm/mremap.c
 *
 * (C) Copyright 1996 Linus Torvalds
 *
 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/swapops.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/sysctl.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    if (pgd_none_or_clear_bad(pgd))
        return NULL;

    pud = pud_offset(pgd, addr);
    if (pud_none_or_clear_bad(pud))
        return NULL;

    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd))
        return NULL;

    return pmd;
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    pud = pud_alloc(mm, pgd, addr);
    if (!pud)
        return NULL;

    pmd = pmd_alloc(mm, pud, addr);
    if (!pmd)
        return NULL;

    VM_BUG_ON(pmd_trans_huge(*pmd));

    return pmd;
}

static pte_t move_soft_dirty_pte(pte_t pte)
{
    /*
     * Set soft dirty bit so we can notice
     * in userspace the ptes were moved.
     */
#ifdef CONFIG_MEM_SOFT_DIRTY
    if (pte_present(pte))
        pte = pte_mksoft_dirty(pte);
    else if (is_swap_pte(pte))
        pte = pte_swp_mksoft_dirty(pte);
    else if (pte_file(pte))
        pte = pte_file_mksoft_dirty(pte);
#endif
    return pte;
}

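The soft-dirty marking above is what lets userspace (for example, checkpoint/restore tools) notice that PTEs were relocated by mremap(). The following userspace sketch is illustrative only and is not part of this file; it assumes a kernel built with CONFIG_MEM_SOFT_DIRTY and relies on the documented /proc interfaces: writing "4" to /proc/self/clear_refs clears the soft-dirty bits, and bit 55 of each 64-bit /proc/self/pagemap entry reports them. Error handling is omitted for brevity.

/* Illustrative userspace sketch (not kernel code), assuming CONFIG_MEM_SOFT_DIRTY:
 * clear the soft-dirty bits, force a move with mremap(MREMAP_FIXED), and
 * observe that the moved page is reported soft-dirty again. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static uint64_t pagemap_entry(void *addr)
{
    uint64_t ent = 0;
    long psize = sysconf(_SC_PAGESIZE);
    int fd = open("/proc/self/pagemap", O_RDONLY);

    /* One 64-bit entry per page; soft-dirty is bit 55. */
    pread(fd, &ent, sizeof(ent),
          (off_t)((uintptr_t)addr / psize) * sizeof(ent));
    close(fd);
    return ent;
}

int main(void)
{
    long psize = sysconf(_SC_PAGESIZE);
    char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    p[0] = 1;   /* fault the page in so a pte exists */

    /* Writing "4" clears the soft-dirty bits of the whole address space. */
    int fd = open("/proc/self/clear_refs", O_WRONLY);
    write(fd, "4", 1);
    close(fd);

    /* Reserve a destination and move the page there; move_ptes() will
     * run move_soft_dirty_pte() on the relocated pte. */
    void *dst = mmap(NULL, psize, PROT_NONE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *q = mremap(p, psize, psize, MREMAP_MAYMOVE | MREMAP_FIXED, dst);

    printf("moved %p -> %p, soft-dirty after move: %s\n", (void *)p,
           (void *)q, (pagemap_entry(q) >> 55) & 1 ? "yes" : "no");
    return 0;
}

A page that was merely written to would also show soft-dirty; clearing the bits immediately before the mremap() is what isolates the effect of the move itself.
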
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        unsigned long old_addr, unsigned long old_end,
        struct vm_area_struct *new_vma, pmd_t *new_pmd,
        unsigned long new_addr, bool need_rmap_locks)
{
    struct address_space *mapping = NULL;
    struct anon_vma *anon_vma = NULL;
    struct mm_struct *mm = vma->vm_mm;
    pte_t *old_pte, *new_pte, pte;
    spinlock_t *old_ptl, *new_ptl;

    /*
     * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
     * locks to ensure that rmap will always observe either the old or the
     * new ptes. This is the easiest way to avoid races with
     * truncate_pagecache(), page migration, etc...
     *
     * When need_rmap_locks is false, we use other ways to avoid
     * such races:
     *
     * - During exec() shift_arg_pages(), we use a specially tagged vma
     *   which rmap call sites look for using is_vma_temporary_stack().
     *
     * - During mremap(), new_vma is often known to be placed after vma
     *   in rmap traversal order. This ensures rmap will always observe
     *   either the old pte, or the new pte, or both (the page table locks
     *   serialize access to individual ptes, but only rmap traversal
     *   order guarantees that we won't miss both the old and new ptes).
     */
    if (need_rmap_locks) {
        if (vma->vm_file) {
            mapping = vma->vm_file->f_mapping;
            mutex_lock(&mapping->i_mmap_mutex);
        }
        if (vma->anon_vma) {
            anon_vma = vma->anon_vma;
            anon_vma_lock_write(anon_vma);
        }
    }

    /*
     * We don't have to worry about the ordering of src and dst
     * pte locks because exclusive mmap_sem prevents deadlock.
     */
    old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
    new_pte = pte_offset_map(new_pmd, new_addr);
    new_ptl = pte_lockptr(mm, new_pmd);
    if (new_ptl != old_ptl)
        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    arch_enter_lazy_mmu_mode();

    for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                   new_pte++, new_addr += PAGE_SIZE) {
        if (pte_none(*old_pte))
            continue;
        pte = ptep_get_and_clear(mm, old_addr, old_pte);
        pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
        pte = move_soft_dirty_pte(pte);
        set_pte_at(mm, new_addr, new_pte, pte);
    }

    arch_leave_lazy_mmu_mode();
    if (new_ptl != old_ptl)
        spin_unlock(new_ptl);
    pte_unmap(new_pte - 1);
    pte_unmap_unlock(old_pte - 1, old_ptl);
    if (anon_vma)
        anon_vma_unlock_write(anon_vma);
    if (mapping)
        mutex_unlock(&mapping->i_mmap_mutex);
}

#define LATENCY_LIMIT (64 * PAGE_SIZE)

unsigned long move_page_tables(struct vm_area_struct *vma,
        unsigned long old_addr, struct vm_area_struct *new_vma,
        unsigned long new_addr, unsigned long len,
        bool need_rmap_locks)
{
    unsigned long extent, next, old_end;
    pmd_t *old_pmd, *new_pmd;
    bool need_flush = false;
    unsigned long mmun_start; /* For mmu_notifiers */
    unsigned long mmun_end; /* For mmu_notifiers */

    old_end = old_addr + len;
    flush_cache_range(vma, old_addr, old_end);

    mmun_start = old_addr;
    mmun_end = old_end;
    mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

    for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
        cond_resched();
        next = (old_addr + PMD_SIZE) & PMD_MASK;
        /* even if next overflowed, extent below will be ok */
        extent = next - old_addr;
        if (extent > old_end - old_addr)
            extent = old_end - old_addr;
        old_pmd = get_old_pmd(vma->vm_mm, old_addr);
        if (!old_pmd)
            continue;
        new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
        if (!new_pmd)
            break;
        if (pmd_trans_huge(*old_pmd)) {
            int err = 0;
            if (extent == HPAGE_PMD_SIZE)
                err = move_huge_pmd(vma, new_vma, old_addr,
                            new_addr, old_end,
                            old_pmd, new_pmd);
            if (err > 0) {
                need_flush = true;
                continue;
            } else if (!err) {
                split_huge_page_pmd(vma, old_addr, old_pmd);
            }
            VM_BUG_ON(pmd_trans_huge(*old_pmd));
        }
        if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
                              new_pmd, new_addr))
            break;
        next = (new_addr + PMD_SIZE) & PMD_MASK;
        if (extent > next - new_addr)
            extent = next - new_addr;
        if (extent > LATENCY_LIMIT)
            extent = LATENCY_LIMIT;
        move_ptes(vma, old_pmd, old_addr, old_addr + extent,
              new_vma, new_pmd, new_addr, need_rmap_locks);
        need_flush = true;
    }
    if (likely(need_flush))
        flush_tlb_range(vma, old_end-len, old_addr);

    mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

    return len + old_addr - old_end; /* how much done */
}

static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long old_addr, unsigned long old_len,
        unsigned long new_len, unsigned long new_addr, bool *locked)
{
    struct mm_struct *mm = vma->vm_mm;
    struct vm_area_struct *new_vma;
    unsigned long vm_flags = vma->vm_flags;
    unsigned long new_pgoff;
    unsigned long moved_len;
    unsigned long excess = 0;
    unsigned long hiwater_vm;
    int split = 0;
    int err;
    bool need_rmap_locks;

    /*
     * We'd prefer to avoid failure later on in do_munmap:
     * which may split one vma into three before unmapping.
     */
    if (mm->map_count >= sysctl_max_map_count - 3)
        return -ENOMEM;

    /*
     * Advise KSM to break any KSM pages in the area to be moved:
     * it would be confusing if they were to turn up at the new
     * location, where they happen to coincide with different KSM
     * pages recently unmapped. But leave vma->vm_flags as it was,
     * so KSM can come around to merge on vma and new_vma afterwards.
     */
    err = ksm_madvise(vma, old_addr, old_addr + old_len,
                        MADV_UNMERGEABLE, &vm_flags);
    if (err)
        return err;

    new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
    new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
               &need_rmap_locks);
    if (!new_vma)
        return -ENOMEM;

    moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
                     need_rmap_locks);
    if (moved_len < old_len) {
        /*
         * On error, move entries back from new area to old,
         * which will succeed since page tables still there,
         * and then proceed to unmap new area instead of old.
         */
        move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
                 true);
        vma = new_vma;
        old_len = new_len;
        old_addr = new_addr;
        new_addr = -ENOMEM;
    }

    /* Conceal VM_ACCOUNT so old reservation is not undone */
    if (vm_flags & VM_ACCOUNT) {
        vma->vm_flags &= ~VM_ACCOUNT;
        excess = vma->vm_end - vma->vm_start - old_len;
        if (old_addr > vma->vm_start &&
            old_addr + old_len < vma->vm_end)
            split = 1;
    }

    /*
     * If we failed to move page tables we still do total_vm increment
     * since do_munmap() will decrement it by old_len == new_len.
     *
     * Since total_vm is about to be raised artificially high for a
     * moment, we need to restore high watermark afterwards: if stats
     * are taken meanwhile, total_vm and hiwater_vm appear too high.
     * If this were a serious issue, we'd add a flag to do_munmap().
     */
    hiwater_vm = mm->hiwater_vm;
    vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

    if (do_munmap(mm, old_addr, old_len) < 0) {
        /* OOM: unable to split vma, just get accounts right */
        vm_unacct_memory(excess >> PAGE_SHIFT);
        excess = 0;
    }
    mm->hiwater_vm = hiwater_vm;

    /* Restore VM_ACCOUNT if one or two pieces of vma left */
    if (excess) {
        vma->vm_flags |= VM_ACCOUNT;
        if (split)
            vma->vm_next->vm_flags |= VM_ACCOUNT;
    }

    if (vm_flags & VM_LOCKED) {
        mm->locked_vm += new_len >> PAGE_SHIFT;
        *locked = true;
    }

    return new_addr;
}

static struct vm_area_struct *vma_to_resize(unsigned long addr,
    unsigned long old_len, unsigned long new_len, unsigned long *p)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = find_vma(mm, addr);

    if (!vma || vma->vm_start > addr)
        goto Efault;

    if (is_vm_hugetlb_page(vma))
        goto Einval;

    /* We can't remap across vm area boundaries */
    if (old_len > vma->vm_end - addr)
        goto Efault;

    /* Need to be careful about a growing mapping */
    if (new_len > old_len) {
        unsigned long pgoff;

        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
            goto Efault;
        pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
        pgoff += vma->vm_pgoff;
        if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
            goto Einval;
    }

    if (vma->vm_flags & VM_LOCKED) {
        unsigned long locked, lock_limit;
        locked = mm->locked_vm << PAGE_SHIFT;
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        locked += new_len - old_len;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
            goto Eagain;
    }

    if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
        goto Enomem;

    if (vma->vm_flags & VM_ACCOUNT) {
        unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            goto Efault;
        *p = charged;
    }

    return vma;

Efault: /* very odd choice for most of the cases, but... */
    return ERR_PTR(-EFAULT);
Einval:
    return ERR_PTR(-EINVAL);
Enomem:
    return ERR_PTR(-ENOMEM);
Eagain:
    return ERR_PTR(-EAGAIN);
}

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
        unsigned long new_addr, unsigned long new_len, bool *locked)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;
    unsigned long map_flags;

    if (new_addr & ~PAGE_MASK)
        goto out;

    if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
        goto out;

    /* Check if the location we're moving into overlaps the
     * old location at all, and fail if it does.
     */
    if ((new_addr <= addr) && (new_addr+new_len) > addr)
        goto out;

    if ((addr <= new_addr) && (addr+old_len) > new_addr)
        goto out;

    ret = do_munmap(mm, new_addr, new_len);
    if (ret)
        goto out;

    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        old_len = new_len;
    }

    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    map_flags = MAP_FIXED;
    if (vma->vm_flags & VM_MAYSHARE)
        map_flags |= MAP_SHARED;

    ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                ((addr - vma->vm_start) >> PAGE_SHIFT),
                map_flags);
    if (ret & ~PAGE_MASK)
        goto out1;

    ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
    if (!(ret & ~PAGE_MASK))
        goto out;
out1:
    vm_unacct_memory(charged);

out:
    return ret;
}

static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
    unsigned long end = vma->vm_end + delta;
    if (end < vma->vm_end) /* overflow */
        return 0;
    if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
        return 0;
    if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                  0, MAP_FIXED) & ~PAGE_MASK)
        return 0;
    return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long, new_len, unsigned long, flags,
        unsigned long, new_addr)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;
    bool locked = false;

    if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
        return ret;

    if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
        return ret;

    if (addr & ~PAGE_MASK)
        return ret;

    old_len = PAGE_ALIGN(old_len);
    new_len = PAGE_ALIGN(new_len);

    /*
     * We allow a zero old-len as a special case
     * for DOS-emu "duplicate shm area" thing. But
     * a zero new-len is nonsensical.
     */
    if (!new_len)
        return ret;

    down_write(&current->mm->mmap_sem);

    if (flags & MREMAP_FIXED) {
        ret = mremap_to(addr, old_len, new_addr, new_len,
                &locked);
        goto out;
    }

    /*
     * Always allow a shrinking remap: that just unmaps
     * the unnecessary pages..
     * do_munmap does all the needed commit accounting
     */
    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        ret = addr;
        goto out;
    }

    /*
     * Ok, we need to grow..
     */
    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    /* old_len exactly to the end of the area..
     */
    if (old_len == vma->vm_end - addr) {
        /* can we just expand the current mapping? */
        if (vma_expandable(vma, new_len - old_len)) {
            int pages = (new_len - old_len) >> PAGE_SHIFT;

            if (vma_adjust(vma, vma->vm_start, addr + new_len,
                       vma->vm_pgoff, NULL)) {
                ret = -ENOMEM;
                goto out;
            }

            vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
            if (vma->vm_flags & VM_LOCKED) {
                mm->locked_vm += pages;
                locked = true;
                new_addr = addr;
            }
            ret = addr;
            goto out;
        }
    }

    /*
     * We weren't able to just expand or shrink the area,
     * we need to create a new one and move it..
     */
    ret = -ENOMEM;
    if (flags & MREMAP_MAYMOVE) {
        unsigned long map_flags = 0;
        if (vma->vm_flags & VM_MAYSHARE)
            map_flags |= MAP_SHARED;

        new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                    vma->vm_pgoff +
                    ((addr - vma->vm_start) >> PAGE_SHIFT),
                    map_flags);
        if (new_addr & ~PAGE_MASK) {
            ret = new_addr;
            goto out;
        }

        ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
    }
out:
    if (ret & ~PAGE_MASK)
        vm_unacct_memory(charged);
    up_write(&current->mm->mmap_sem);
    if (locked && new_len > old_len)
        mm_populate(new_addr + old_len, new_len - old_len);
    return ret;
}
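
For reference, here is a minimal userspace caller of the syscall defined above. It is a sketch, not part of mm/mremap.c: it grows an anonymous mapping with MREMAP_MAYMOVE, so the kernel first tries the in-place expansion path (vma_expandable()/vma_adjust()) and otherwise falls back to move_vma(); either way the contents are preserved. Error handling is minimal.

/* Illustrative userspace sketch (not kernel code): grow an anonymous
 * mapping with mremap(MREMAP_MAYMOVE) and check whether it was expanded
 * in place or moved; the data travels with the pages in both cases. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long psize = sysconf(_SC_PAGESIZE);
    size_t old_len = 4 * psize, new_len = 64 * psize;

    char *old = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (old == MAP_FAILED)
        return 1;
    memset(old, 0xab, old_len);

    /* Without MREMAP_MAYMOVE this fails with ENOMEM whenever the area
     * cannot be expanded in place (see vma_expandable() above). */
    char *new = mremap(old, old_len, new_len, MREMAP_MAYMOVE);
    if (new == MAP_FAILED) {
        perror("mremap");
        return 1;
    }

    printf("%p -> %p (%s), first byte still 0x%02x\n",
           (void *)old, (void *)new,
           new == old ? "expanded in place" : "moved",
           (unsigned char)new[0]);

    munmap(new, new_len);
    return 0;
}

If the same call is made without MREMAP_MAYMOVE and the region cannot grow in place, the syscall returns ENOMEM rather than relocating the mapping, which matches the final MREMAP_MAYMOVE branch of the syscall body above.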
