
/*
 * linux/mm/madvise.c
 *
 * Copyright (C) 1999 Linus Torvalds
 * Copyright (C) 2002 Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/hugetlb.h>
#include <linux/sched.h>
#include <linux/ksm.h>

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_sem for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
    switch (behavior) {
    case MADV_REMOVE:
    case MADV_WILLNEED:
    case MADV_DONTNEED:
        return 0;
    default:
        /* be safe, default to 1. list exceptions explicitly */
        return 1;
    }
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
 */
static long madvise_behavior(struct vm_area_struct * vma,
             struct vm_area_struct **prev,
             unsigned long start, unsigned long end, int behavior)
{
    struct mm_struct * mm = vma->vm_mm;
    int error = 0;
    pgoff_t pgoff;
    unsigned long new_flags = vma->vm_flags;

    switch (behavior) {
    case MADV_NORMAL:
        new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
        break;
    case MADV_SEQUENTIAL:
        new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
        break;
    case MADV_RANDOM:
        new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
        break;
    case MADV_DONTFORK:
        new_flags |= VM_DONTCOPY;
        break;
    case MADV_DOFORK:
        if (vma->vm_flags & VM_IO) {
            error = -EINVAL;
            goto out;
        }
        new_flags &= ~VM_DONTCOPY;
        break;
    case MADV_MERGEABLE:
    case MADV_UNMERGEABLE:
        error = ksm_madvise(vma, start, end, behavior, &new_flags);
        if (error)
            goto out;
        break;
    case MADV_HUGEPAGE:
    case MADV_NOHUGEPAGE:
        error = hugepage_madvise(vma, &new_flags, behavior);
        if (error)
            goto out;
        break;
    }

    if (new_flags == vma->vm_flags) {
        *prev = vma;
        goto out;
    }

    pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
    *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                vma->vm_file, pgoff, vma_policy(vma));
    if (*prev) {
        vma = *prev;
        goto success;
    }

    *prev = vma;

    if (start != vma->vm_start) {
        error = split_vma(mm, vma, start, 1);
        if (error)
            goto out;
    }

    if (end != vma->vm_end) {
        error = split_vma(mm, vma, end, 0);
        if (error)
            goto out;
    }

success:
    /*
     * vm_flags is protected by the mmap_sem held in write mode.
     */
    vma->vm_flags = new_flags;

out:
    if (error == -ENOMEM)
        error = -EAGAIN;
    return error;
}
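
/*
 * Illustrative userspace sketch (not part of this file): advising only a
 * sub-range of a mapping exercises the split_vma()/vma_merge() paths above.
 * Because only the middle page gets VM_SEQ_READ in its vm_flags, the
 * neighbours cannot merge with it and /proc/self/maps typically shows the
 * single anonymous mapping split into three VMAs afterwards. Function and
 * variable names here are made up for the example.
 *
 *  #include <sys/mman.h>
 *  #include <unistd.h>
 *
 *  void split_example(void)
 *  {
 *      size_t page = sysconf(_SC_PAGESIZE);
 *      char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
 *                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *      if (p != MAP_FAILED)
 *          madvise(p + page, page, MADV_SEQUENTIAL);
 *  }
 */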

/*
 * Schedule all required I/O operations. Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct * vma,
                 struct vm_area_struct ** prev,
                 unsigned long start, unsigned long end)
{
    struct file *file = vma->vm_file;

    if (!file)
        return -EBADF;

    if (file->f_mapping->a_ops->get_xip_mem) {
        /* no bad return value, but ignore advice */
        return 0;
    }

    *prev = vma;
    start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
    if (end > vma->vm_end)
        end = vma->vm_end;
    end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

    force_page_cache_readahead(file->f_mapping, file, start, end - start);
    return 0;
}
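
/*
 * Illustrative userspace sketch (not part of this file): MADV_WILLNEED on a
 * file-backed mapping triggers the asynchronous readahead above, so the call
 * returns before the I/O completes and later faults on the range are more
 * likely to hit the page cache. "data.bin" is just a stand-in file name.
 *
 *  #include <fcntl.h>
 *  #include <sys/mman.h>
 *
 *  void prefetch_example(size_t len)
 *  {
 *      int fd = open("data.bin", O_RDONLY);
 *      if (fd < 0)
 *          return;
 *      void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *      if (p != MAP_FAILED)
 *          madvise(p, len, MADV_WILLNEED);
 *  }
 */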

/*
 * Application no longer needs these pages. If the pages are dirty,
 * it's OK to just throw them away. The app will be more careful about
 * data it wants to keep. Be sure to free swap resources too. The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them. There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed(struct vm_area_struct * vma,
                 struct vm_area_struct ** prev,
                 unsigned long start, unsigned long end)
{
    *prev = vma;
    if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
        return -EINVAL;

    if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
        struct zap_details details = {
            .nonlinear_vma = vma,
            .last_index = ULONG_MAX,
        };
        zap_page_range(vma, start, end - start, &details);
    } else
        zap_page_range(vma, start, end - start, NULL);
    return 0;
}
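
/*
 * Illustrative userspace sketch (not part of this file): the pattern the
 * comment above describes. Once a transaction's contents have been committed
 * to their backing store, an anonymous scratch buffer can be dropped with
 * MADV_DONTNEED instead of being written to swap; the pages stay mapped and
 * the next touch faults in fresh zero pages. The helper name is made up.
 *
 *  #include <sys/mman.h>
 *
 *  void discard_committed_scratch(void *buf, size_t len)
 *  {
 *      madvise(buf, len, MADV_DONTNEED);
 *  }
 */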

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 *
 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
 * Other filesystems return -ENOSYS.
 */
static long madvise_remove(struct vm_area_struct *vma,
                struct vm_area_struct **prev,
                unsigned long start, unsigned long end)
{
    struct address_space *mapping;
    loff_t offset, endoff;
    int error;

    *prev = NULL; /* tell sys_madvise we drop mmap_sem */

    if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
        return -EINVAL;

    if (!vma->vm_file || !vma->vm_file->f_mapping
        || !vma->vm_file->f_mapping->host) {
            return -EINVAL;
    }

    if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
        return -EACCES;

    mapping = vma->vm_file->f_mapping;

    offset = (loff_t)(start - vma->vm_start)
            + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
    endoff = (loff_t)(end - vma->vm_start - 1)
            + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

    /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
    up_read(&current->mm->mmap_sem);
    error = vmtruncate_range(mapping->host, offset, endoff);
    down_read(&current->mm->mmap_sem);
    return error;
}
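
/*
 * Illustrative userspace sketch (not part of this file): punching a hole in a
 * tmpfs-backed file through a shared, writable mapping, which is the case the
 * code above accepts. "/dev/shm/scratch" is just an example path on a tmpfs
 * mount, and off/len are assumed to be page-aligned.
 *
 *  #include <fcntl.h>
 *  #include <sys/mman.h>
 *  #include <unistd.h>
 *
 *  void punch_hole_example(size_t len, size_t off)
 *  {
 *      int fd = open("/dev/shm/scratch", O_RDWR | O_CREAT, 0600);
 *      if (fd < 0 || ftruncate(fd, len) < 0)
 *          return;
 *      char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *      if (p != MAP_FAILED)
 *          madvise(p + off, len - off, MADV_REMOVE);
 *  }
 */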

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
{
    int ret = 0;

    if (!capable(CAP_SYS_ADMIN))
        return -EPERM;
    for (; start < end; start += PAGE_SIZE) {
        struct page *p;
        ret = get_user_pages_fast(start, 1, 0, &p);
        if (ret != 1)
            return ret;
        if (bhv == MADV_SOFT_OFFLINE) {
            printk(KERN_INFO "Soft offlining page %lx at %lx\n",
                page_to_pfn(p), start);
            ret = soft_offline_page(p, MF_COUNT_INCREASED);
            if (ret)
                break;
            continue;
        }
        printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
               page_to_pfn(p), start);
        /* Ignore return value for now */
        __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
    }
    return ret;
}
#endif
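
/*
 * Illustrative userspace sketch (not part of this file): hwpoison error
 * injection as handled above. It requires a kernel built with
 * CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, and libc headers that define
 * MADV_HWPOISON; the page is deliberately poisoned, so this is only useful
 * for memory-failure testing. The wrapper name is made up.
 *
 *  #include <sys/mman.h>
 *
 *  int inject_poison(void *page_addr, size_t page_size)
 *  {
 *      return madvise(page_addr, page_size, MADV_HWPOISON);
 *  }
 */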

/*
 * Apply one behavior to a single vma; the caller has already clipped
 * [start, end) to this vma.
 */
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
        unsigned long start, unsigned long end, int behavior)
{
    switch (behavior) {
    case MADV_REMOVE:
        return madvise_remove(vma, prev, start, end);
    case MADV_WILLNEED:
        return madvise_willneed(vma, prev, start, end);
    case MADV_DONTNEED:
        return madvise_dontneed(vma, prev, start, end);
    default:
        return madvise_behavior(vma, prev, start, end, behavior);
    }
}

/*
 * Only accept advice values this kernel build can handle: KSM and
 * transparent hugepage advice are valid only when the corresponding
 * config options are enabled.
 */
static int
madvise_behavior_valid(int behavior)
{
    switch (behavior) {
    case MADV_DOFORK:
    case MADV_DONTFORK:
    case MADV_NORMAL:
    case MADV_SEQUENTIAL:
    case MADV_RANDOM:
    case MADV_REMOVE:
    case MADV_WILLNEED:
    case MADV_DONTNEED:
#ifdef CONFIG_KSM
    case MADV_MERGEABLE:
    case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    case MADV_HUGEPAGE:
    case MADV_NOHUGEPAGE:
#endif
        return 1;

    default:
        return 0;
    }
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area. The idea is to help the kernel
 * use appropriate read-ahead and caching techniques. The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters. This
 *        results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *        on any access, since it is unlikely that the application
 *        will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *        once, so they can be aggressively read ahead, and
 *        can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *        some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *        so the kernel can free resources associated with it.
 *  MADV_REMOVE - the application wants to free up the given range of
 *        pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *        typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *        this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *        "behavior" is not a valid value, or application
 *        is attempting to release locked or shared pages.
 *  -ENOMEM - addresses in the specified range are not currently
 *        mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
    unsigned long end, tmp;
    struct vm_area_struct * vma, *prev;
    int unmapped_error = 0;
    int error = -EINVAL;
    int write;
    size_t len;

#ifdef CONFIG_MEMORY_FAILURE
    if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
        return madvise_hwpoison(behavior, start, start+len_in);
#endif
    if (!madvise_behavior_valid(behavior))
        return error;

    write = madvise_need_mmap_write(behavior);
    if (write)
        down_write(&current->mm->mmap_sem);
    else
        down_read(&current->mm->mmap_sem);

    if (start & ~PAGE_MASK)
        goto out;
    len = (len_in + ~PAGE_MASK) & PAGE_MASK;

    /* Check to see whether len was rounded up from small -ve to zero */
    if (len_in && !len)
        goto out;

    end = start + len;
    if (end < start)
        goto out;

    error = 0;
    if (end == start)
        goto out;

    /*
     * If the interval [start,end) covers some unmapped address
     * ranges, just ignore them, but return -ENOMEM at the end.
     * - different from the way of handling in mlock etc.
     */
    vma = find_vma_prev(current->mm, start, &prev);
    if (vma && start > vma->vm_start)
        prev = vma;

    for (;;) {
        /* Still start < end. */
        error = -ENOMEM;
        if (!vma)
            goto out;

        /* Here start < (end|vma->vm_end). */
        if (start < vma->vm_start) {
            unmapped_error = -ENOMEM;
            start = vma->vm_start;
            if (start >= end)
                goto out;
        }

        /* Here vma->vm_start <= start < (end|vma->vm_end) */
        tmp = vma->vm_end;
        if (end < tmp)
            tmp = end;

        /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
        error = madvise_vma(vma, &prev, start, tmp, behavior);
        if (error)
            goto out;
        start = tmp;
        if (prev && start < prev->vm_end)
            start = prev->vm_end;
        error = unmapped_error;
        if (start >= end)
            goto out;
        if (prev)
            vma = prev->vm_next;
        else /* madvise_remove dropped mmap_sem */
            vma = find_vma(current->mm, start);
    }
out:
    if (write)
        up_write(&current->mm->mmap_sem);
    else
        up_read(&current->mm->mmap_sem);

    return error;
}

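/*
 * Illustrative userspace sketch (not part of this file): a minimal caller of
 * the syscall defined above, via the libc madvise() wrapper. start must be
 * page-aligned, the length is rounded up to a page multiple, and on failure
 * errno carries one of the codes listed in the comment block above (EINVAL,
 * ENOMEM, EIO, EBADF, EAGAIN). The wrapper name is made up.
 *
 *  #include <stdio.h>
 *  #include <sys/mman.h>
 *
 *  int advise_random(void *start, size_t len)
 *  {
 *      if (madvise(start, len, MADV_RANDOM) != 0) {
 *          perror("madvise");
 *          return -1;
 *      }
 *      return 0;
 *  }
 */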
