/*
 * mm/mremap.c
 *
 * (C) Copyright 1996 Linus Torvalds
 *
 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

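/*
 * Walk the existing page tables down to the pmd covering old_addr.
 * Returns NULL if any level is not present, so the caller can simply
 * skip that pmd-sized chunk: there is nothing there to move.
 */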
static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    if (pgd_none_or_clear_bad(pgd))
        return NULL;

    pud = pud_offset(pgd, addr);
    if (pud_none_or_clear_bad(pud))
        return NULL;

    pmd = pmd_offset(pud, addr);
    if (pmd_none(*pmd))
        return NULL;

    return pmd;
}

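/*
 * Allocate (if need be) the pud and pmd covering the destination
 * address.  Returns NULL on allocation failure; the pmd found or
 * allocated here is never expected to be transparent huge.
 */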
static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long addr)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;

    pgd = pgd_offset(mm, addr);
    pud = pud_alloc(mm, pgd, addr);
    if (!pud)
        return NULL;

    pmd = pmd_alloc(mm, pud, addr);
    if (!pmd)
        return NULL;

    VM_BUG_ON(pmd_trans_huge(*pmd));

    return pmd;
}

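/*
 * Shift a run of ptes from old_pmd to new_pmd, clearing each old
 * entry as it is copied.  Both pte locks are held across the loop;
 * see the comments below for why i_mmap_mutex is also taken for
 * file-backed vmas.
 */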
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
        unsigned long old_addr, unsigned long old_end,
        struct vm_area_struct *new_vma, pmd_t *new_pmd,
        unsigned long new_addr)
{
    struct address_space *mapping = NULL;
    struct mm_struct *mm = vma->vm_mm;
    pte_t *old_pte, *new_pte, pte;
    spinlock_t *old_ptl, *new_ptl;

    if (vma->vm_file) {
        /*
         * Subtle point from Rajesh Venkatasubramanian: before
         * moving file-based ptes, we must lock truncate_pagecache
         * out, since it might clean the dst vma before the src vma,
         * and we propagate stale pages into the dst afterward.
         */
        mapping = vma->vm_file->f_mapping;
        mutex_lock(&mapping->i_mmap_mutex);
    }

    /*
     * We don't have to worry about the ordering of src and dst
     * pte locks because exclusive mmap_sem prevents deadlock.
     */
    old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
    new_pte = pte_offset_map(new_pmd, new_addr);
    new_ptl = pte_lockptr(mm, new_pmd);
    if (new_ptl != old_ptl)
        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
    arch_enter_lazy_mmu_mode();

    for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                   new_pte++, new_addr += PAGE_SIZE) {
        if (pte_none(*old_pte))
            continue;
        pte = ptep_get_and_clear(mm, old_addr, old_pte);
        pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
        set_pte_at(mm, new_addr, new_pte, pte);
    }

    arch_leave_lazy_mmu_mode();
    if (new_ptl != old_ptl)
        spin_unlock(new_ptl);
    pte_unmap(new_pte - 1);
    pte_unmap_unlock(old_pte - 1, old_ptl);
    if (mapping)
        mutex_unlock(&mapping->i_mmap_mutex);
}

#define LATENCY_LIMIT (64 * PAGE_SIZE)

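/*
 * Move the page tables for len bytes from old_addr in vma to new_addr
 * in new_vma, working in pmd-sized extents capped at LATENCY_LIMIT.
 * Transparent huge pmds are moved whole when possible, otherwise
 * split.  Returns the number of bytes actually moved, which falls
 * short of len only if a destination page table cannot be allocated.
 */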
unsigned long move_page_tables(struct vm_area_struct *vma,
        unsigned long old_addr, struct vm_area_struct *new_vma,
        unsigned long new_addr, unsigned long len)
{
    unsigned long extent, next, old_end;
    pmd_t *old_pmd, *new_pmd;
    bool need_flush = false;

    old_end = old_addr + len;
    flush_cache_range(vma, old_addr, old_end);

    mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);

    for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
        cond_resched();
        next = (old_addr + PMD_SIZE) & PMD_MASK;
        /* even if next overflowed, extent below will be ok */
        extent = next - old_addr;
        if (extent > old_end - old_addr)
            extent = old_end - old_addr;
        old_pmd = get_old_pmd(vma->vm_mm, old_addr);
        if (!old_pmd)
            continue;
        new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
        if (!new_pmd)
            break;
        if (pmd_trans_huge(*old_pmd)) {
            int err = 0;
            if (extent == HPAGE_PMD_SIZE)
                err = move_huge_pmd(vma, new_vma, old_addr,
                            new_addr, old_end,
                            old_pmd, new_pmd);
            if (err > 0) {
                need_flush = true;
                continue;
            } else if (!err) {
                split_huge_page_pmd(vma->vm_mm, old_pmd);
            }
            VM_BUG_ON(pmd_trans_huge(*old_pmd));
        }
        if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
                              new_pmd, new_addr))
            break;
        next = (new_addr + PMD_SIZE) & PMD_MASK;
        if (extent > next - new_addr)
            extent = next - new_addr;
        if (extent > LATENCY_LIMIT)
            extent = LATENCY_LIMIT;
        move_ptes(vma, old_pmd, old_addr, old_addr + extent,
                new_vma, new_pmd, new_addr);
        need_flush = true;
    }
    if (likely(need_flush))
        flush_tlb_range(vma, old_end-len, old_addr);

    mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);

    return len + old_addr - old_end; /* how much done */
}

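/*
 * Relocate vma to new_addr: copy the vma, move its page tables across,
 * then unmap the old range, fixing up accounting, mlock and KSM state
 * along the way.  If the page tables cannot be fully moved, everything
 * is moved back and -ENOMEM is returned instead of the new address.
 */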
static unsigned long move_vma(struct vm_area_struct *vma,
        unsigned long old_addr, unsigned long old_len,
        unsigned long new_len, unsigned long new_addr)
{
    struct mm_struct *mm = vma->vm_mm;
    struct vm_area_struct *new_vma;
    unsigned long vm_flags = vma->vm_flags;
    unsigned long new_pgoff;
    unsigned long moved_len;
    unsigned long excess = 0;
    unsigned long hiwater_vm;
    int split = 0;
    int err;

    /*
     * We'd prefer to avoid failure later on in do_munmap:
     * which may split one vma into three before unmapping.
     */
    if (mm->map_count >= sysctl_max_map_count - 3)
        return -ENOMEM;

    /*
     * Advise KSM to break any KSM pages in the area to be moved:
     * it would be confusing if they were to turn up at the new
     * location, where they happen to coincide with different KSM
     * pages recently unmapped. But leave vma->vm_flags as it was,
     * so KSM can come around to merge on vma and new_vma afterwards.
     */
    err = ksm_madvise(vma, old_addr, old_addr + old_len,
                        MADV_UNMERGEABLE, &vm_flags);
    if (err)
        return err;

    new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
    new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
    if (!new_vma)
        return -ENOMEM;

    moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
    if (moved_len < old_len) {
        /*
         * Before moving the page tables from the new vma to
         * the old vma, we need to be sure the old vma is
         * queued after new vma in the same_anon_vma list to
         * prevent SMP races with rmap_walk (that could lead
         * rmap_walk to miss some page table).
         */
        anon_vma_moveto_tail(vma);

        /*
         * On error, move entries back from new area to old,
         * which will succeed since page tables still there,
         * and then proceed to unmap new area instead of old.
         */
        move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
        vma = new_vma;
        old_len = new_len;
        old_addr = new_addr;
        new_addr = -ENOMEM;
    }

    /* Conceal VM_ACCOUNT so old reservation is not undone */
    if (vm_flags & VM_ACCOUNT) {
        vma->vm_flags &= ~VM_ACCOUNT;
        excess = vma->vm_end - vma->vm_start - old_len;
        if (old_addr > vma->vm_start &&
            old_addr + old_len < vma->vm_end)
            split = 1;
    }

    /*
     * If we failed to move page tables we still do total_vm increment
     * since do_munmap() will decrement it by old_len == new_len.
     *
     * Since total_vm is about to be raised artificially high for a
     * moment, we need to restore high watermark afterwards: if stats
     * are taken meanwhile, total_vm and hiwater_vm appear too high.
     * If this were a serious issue, we'd add a flag to do_munmap().
     */
    hiwater_vm = mm->hiwater_vm;
    vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

    if (do_munmap(mm, old_addr, old_len) < 0) {
        /* OOM: unable to split vma, just get accounts right */
        vm_unacct_memory(excess >> PAGE_SHIFT);
        excess = 0;
    }
    mm->hiwater_vm = hiwater_vm;

    /* Restore VM_ACCOUNT if one or two pieces of vma left */
    if (excess) {
        vma->vm_flags |= VM_ACCOUNT;
        if (split)
            vma->vm_next->vm_flags |= VM_ACCOUNT;
    }

    if (vm_flags & VM_LOCKED) {
        mm->locked_vm += new_len >> PAGE_SHIFT;
        if (new_len > old_len)
            mlock_vma_pages_range(new_vma, new_addr + old_len,
                               new_addr + new_len);
    }

    return new_addr;
}

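/*
 * Look up and validate the vma to be resized from old_len to new_len:
 * it must exist, cover the whole old range and not be hugetlb, and a
 * growing remap must also pass the mlock, address space and commit
 * checks.  On success *p holds the number of pages charged against
 * commit accounting (zero unless the vma is VM_ACCOUNT).
 */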
static struct vm_area_struct *vma_to_resize(unsigned long addr,
    unsigned long old_len, unsigned long new_len, unsigned long *p)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma = find_vma(mm, addr);

    if (!vma || vma->vm_start > addr)
        goto Efault;

    if (is_vm_hugetlb_page(vma))
        goto Einval;

    /* We can't remap across vm area boundaries */
    if (old_len > vma->vm_end - addr)
        goto Efault;

    /* Need to be careful about a growing mapping */
    if (new_len > old_len) {
        unsigned long pgoff;

        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
            goto Efault;
        pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
        pgoff += vma->vm_pgoff;
        if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
            goto Einval;
    }

    if (vma->vm_flags & VM_LOCKED) {
        unsigned long locked, lock_limit;
        locked = mm->locked_vm << PAGE_SHIFT;
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        locked += new_len - old_len;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
            goto Eagain;
    }

    if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
        goto Enomem;

    if (vma->vm_flags & VM_ACCOUNT) {
        unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
        if (security_vm_enough_memory_mm(mm, charged))
            goto Efault;
        *p = charged;
    }

    return vma;

Efault: /* very odd choice for most of the cases, but... */
    return ERR_PTR(-EFAULT);
Einval:
    return ERR_PTR(-EINVAL);
Enomem:
    return ERR_PTR(-ENOMEM);
Eagain:
    return ERR_PTR(-EAGAIN);
}

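/*
 * MREMAP_FIXED path: validate the caller-supplied destination, refuse
 * any overlap with the source, unmap whatever currently occupies the
 * destination, then hand the (possibly shrunk) source over to
 * move_vma().
 */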
static unsigned long mremap_to(unsigned long addr,
    unsigned long old_len, unsigned long new_addr,
    unsigned long new_len)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;
    unsigned long map_flags;

    if (new_addr & ~PAGE_MASK)
        goto out;

    if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
        goto out;

    /* Check if the location we're moving into overlaps the
     * old location at all, and fail if it does.
     */
    if ((new_addr <= addr) && (new_addr+new_len) > addr)
        goto out;

    if ((addr <= new_addr) && (addr+old_len) > new_addr)
        goto out;

    ret = do_munmap(mm, new_addr, new_len);
    if (ret)
        goto out;

    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        old_len = new_len;
    }

    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    map_flags = MAP_FIXED;
    if (vma->vm_flags & VM_MAYSHARE)
        map_flags |= MAP_SHARED;

    ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
                ((addr - vma->vm_start) >> PAGE_SHIFT),
                map_flags);
    if (ret & ~PAGE_MASK)
        goto out1;

    ret = move_vma(vma, addr, old_len, new_len, new_addr);
    if (!(ret & ~PAGE_MASK))
        goto out;
out1:
    vm_unacct_memory(charged);

out:
    return ret;
}

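/*
 * Can vma grow in place by delta bytes?  The extended range must not
 * overflow, must not run into the following vma, and must still be
 * acceptable to the architecture's get_unmapped_area().
 */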
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
    unsigned long end = vma->vm_end + delta;
    if (end < vma->vm_end) /* overflow */
        return 0;
    if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
        return 0;
    if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
                  0, MAP_FIXED) & ~PAGE_MASK)
        return 0;
    return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
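/*
 * Illustrative userspace sketch (not part of this file) of the two
 * call shapes handled below; "fixed_addr" stands for a hypothetical
 * page-aligned destination chosen by the caller:
 *
 *    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *    p = mremap(p, 4096, 8192, MREMAP_MAYMOVE);
 *    p = mremap(p, 8192, 8192, MREMAP_MAYMOVE | MREMAP_FIXED, fixed_addr);
 */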
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
        unsigned long, new_len, unsigned long, flags,
        unsigned long, new_addr)
{
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *vma;
    unsigned long ret = -EINVAL;
    unsigned long charged = 0;

    down_write(&current->mm->mmap_sem);

    if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
        goto out;

    if (addr & ~PAGE_MASK)
        goto out;

    old_len = PAGE_ALIGN(old_len);
    new_len = PAGE_ALIGN(new_len);

    /*
     * We allow a zero old-len as a special case
     * for DOS-emu "duplicate shm area" thing. But
     * a zero new-len is nonsensical.
     */
    if (!new_len)
        goto out;

    if (flags & MREMAP_FIXED) {
        if (flags & MREMAP_MAYMOVE)
            ret = mremap_to(addr, old_len, new_addr, new_len);
        goto out;
    }

    /*
     * Always allow a shrinking remap: that just unmaps
     * the unnecessary pages..
     * do_munmap does all the needed commit accounting
     */
    if (old_len >= new_len) {
        ret = do_munmap(mm, addr+new_len, old_len - new_len);
        if (ret && old_len != new_len)
            goto out;
        ret = addr;
        goto out;
    }

    /*
     * Ok, we need to grow..
     */
    vma = vma_to_resize(addr, old_len, new_len, &charged);
    if (IS_ERR(vma)) {
        ret = PTR_ERR(vma);
        goto out;
    }

    /* old_len exactly to the end of the area..
     */
    if (old_len == vma->vm_end - addr) {
        /* can we just expand the current mapping? */
        if (vma_expandable(vma, new_len - old_len)) {
            int pages = (new_len - old_len) >> PAGE_SHIFT;

            if (vma_adjust(vma, vma->vm_start, addr + new_len,
                       vma->vm_pgoff, NULL)) {
                ret = -ENOMEM;
                goto out;
            }

            vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
            if (vma->vm_flags & VM_LOCKED) {
                mm->locked_vm += pages;
                mlock_vma_pages_range(vma, addr + old_len,
                           addr + new_len);
            }
            ret = addr;
            goto out;
        }
    }

    /*
     * We weren't able to just expand or shrink the area,
     * we need to create a new one and move it..
     */
    ret = -ENOMEM;
    if (flags & MREMAP_MAYMOVE) {
        unsigned long map_flags = 0;
        if (vma->vm_flags & VM_MAYSHARE)
            map_flags |= MAP_SHARED;

        new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
                    vma->vm_pgoff +
                    ((addr - vma->vm_start) >> PAGE_SHIFT),
                    map_flags);
        if (new_addr & ~PAGE_MASK) {
            ret = new_addr;
            goto out;
        }

        ret = move_vma(vma, addr, old_len, new_len, new_addr);
    }
out:
    if (ret & ~PAGE_MASK)
        vm_unacct_memory(charged);
    up_write(&current->mm->mmap_sem);
    return ret;
}