/*
 * linux/mm/mlock.c
 *
 * (C) Copyright 1995 Linus Torvalds
 * (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>

#include "internal.h"

int can_do_mlock(void)
{
	if (capable(CAP_IPC_LOCK))
		return 1;
	if (rlimit(RLIMIT_MEMLOCK) != 0)
		return 1;
	return 0;
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked pages are marked with PageMlocked() flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
 * The unevictable list is an LRU sibling list to the [in]active lists.
 * PageUnevictable is set to indicate the unevictable state.
 *
 * When lazy mlocking via vmscan, it is important to ensure that the
 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
 * may have mlocked a page that is being munlocked. So lazy mlock must take
 * the mmap_sem for read, and verify that the vma really is locked
 * (see mm/rmap.c).
 */

/*
 * LRU accounting for clear_page_mlock()
 */
void clear_page_mlock(struct page *page)
{
	if (!TestClearPageMlocked(page))
		return;

	mod_zone_page_state(page_zone(page), NR_MLOCK,
			    -hpage_nr_pages(page));
	count_vm_event(UNEVICTABLE_PGCLEARED);
	if (!isolate_lru_page(page)) {
		putback_lru_page(page);
	} else {
		/*
		 * We lost the race. The page already moved to the evictable list.
		 */
		if (PageUnevictable(page))
			count_vm_event(UNEVICTABLE_PGSTRANDED);
	}
}

/*
 * Mark page as mlocked if not already.
 * If the page is on the LRU, isolate it and put it back so that it moves to
 * the unevictable list.
 */
void mlock_vma_page(struct page *page)
{
	BUG_ON(!PageLocked(page));

	if (!TestSetPageMlocked(page)) {
		mod_zone_page_state(page_zone(page), NR_MLOCK,
				    hpage_nr_pages(page));
		count_vm_event(UNEVICTABLE_PGMLOCKED);
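		/*
		 * putback_lru_page() checks page_evictable() and, with
		 * PageMlocked now set, will file the page on the unevictable
		 * LRU list.
		 */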
		if (!isolate_lru_page(page))
			putback_lru_page(page);
	}
}

/**
 * munlock_vma_page - munlock a vma page
 * @page: page to be unlocked
 *
 * Called from munlock()/munmap() path with page supposedly on the LRU.
 * When we munlock a page, because the vma where we found the page is being
 * munlock()ed or munmap()ed, we want to check whether other vmas hold the
 * page locked so that we can leave it on the unevictable lru list and not
 * bother vmscan with it.  However, to walk the page's rmap list in
 * try_to_munlock() we must isolate the page from the LRU.  If some other
 * task has removed the page from the LRU, we won't be able to do that.
 * So we clear the PageMlocked as we might not get another chance.  If we
 * can't isolate the page, we leave it for putback_lru_page() and vmscan
 * [page_referenced()/try_to_unmap()] to deal with.
 *
 * Returns the page mask the caller uses to skip the rest of a THP:
 * nr_pages - 1 when the page was mlocked (so 0 for a small page), or 0
 * when the page was not mlocked.
 */
unsigned int munlock_vma_page(struct page *page)
{
	unsigned int page_mask = 0;

	BUG_ON(!PageLocked(page));

	if (TestClearPageMlocked(page)) {
		unsigned int nr_pages = hpage_nr_pages(page);
		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
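		/*
		 * For a THP, nr_pages - 1 gives the caller a mask it can use
		 * to skip the tail pages of the huge page; for a small page
		 * it is simply 0.
		 */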
		page_mask = nr_pages - 1;
		if (!isolate_lru_page(page)) {
			int ret = SWAP_AGAIN;

			/*
			 * Optimization: if the page was mapped just once,
			 * that's our mapping and we don't need to check all the
			 * other vmas.
			 */
			if (page_mapcount(page) > 1)
				ret = try_to_munlock(page);
			/*
			 * did try_to_munlock() succeed or punt?
			 */
			if (ret != SWAP_MLOCK)
				count_vm_event(UNEVICTABLE_PGMUNLOCKED);

			putback_lru_page(page);
		} else {
			/*
			 * Some other task has removed the page from the LRU.
			 * putback_lru_page() will take care of removing the
			 * page from the unevictable list, if necessary.
			 * vmscan [page_referenced()] will move the page back
			 * to the unevictable list if some other vma has it
			 * mlocked.
			 */
			if (PageUnevictable(page))
				count_vm_event(UNEVICTABLE_PGSTRANDED);
			else
				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
		}
	}

	return page_mask;
}

/**
 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking: passed through to __get_user_pages()
 *
 * This takes care of making the pages present too.
 *
 * Returns the number of pages faulted in on success, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_sem must be held for at least read.
 */
long __mlock_vma_pages_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *nonblocking)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end & ~PAGE_MASK);
	VM_BUG_ON(start < vma->vm_start);
	VM_BUG_ON(end > vma->vm_end);
	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

	gup_flags = FOLL_TOUCH | FOLL_MLOCK;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
		gup_flags |= FOLL_FORCE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
				NULL, NULL, nonblocking);
}

/*
 * convert get_user_pages() return value to posix mlock() error
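 *
 * Note on the mapping below (as read from the code): GUP reports a bad or
 * unmapped address as -EFAULT, which POSIX expects mlock() to report as
 * ENOMEM, while GUP's -ENOMEM (allocation failure) becomes EAGAIN, i.e.
 * "could not lock the memory right now".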
 */
static int __mlock_posix_error_return(long retval)
{
	if (retval == -EFAULT)
		retval = -ENOMEM;
	else if (retval == -ENOMEM)
		retval = -EAGAIN;
	return retval;
}

/*
 * munlock_vma_pages_range() - munlock all pages in the vma range.
 * @vma - vma containing range to be munlock()ed.
 * @start - start address in @vma of the range
 * @end - end of range in @vma.
 *
 * For mremap(), munmap() and exit().
 *
 * Called with @vma VM_LOCKED.
 *
 * Returns with VM_LOCKED cleared.  Callers must be prepared to
 * deal with this.
 *
 * We don't save and restore VM_LOCKED here because the pages are
 * still on the lru.  In the unmap path, pages might be scanned by reclaim
 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
 * free them.  This will result in freeing mlocked pages.
 */
void munlock_vma_pages_range(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	vma->vm_flags &= ~VM_LOCKED;

	while (start < end) {
		struct page *page;
		unsigned int page_mask, page_increm;

		/*
		 * Although FOLL_DUMP is intended for get_dump_page(),
		 * it just so happens that its special treatment of the
		 * ZERO_PAGE (returning an error instead of doing get_page)
		 * suits munlock very well (and if somehow an abnormal page
		 * has sneaked into the range, we won't oops here: great).
		 */
		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
					&page_mask);
		if (page && !IS_ERR(page)) {
			lock_page(page);
			lru_add_drain();
			/*
			 * Any THP page found by follow_page_mask() may have
			 * gotten split before reaching munlock_vma_page(),
			 * so we need to recompute the page_mask here.
			 */
			page_mask = munlock_vma_page(page);
			unlock_page(page);
			put_page(page);
		}
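		/*
		 * Advance in whole pages: page_mask from a THP rounds start
		 * up past the end of the huge page, otherwise (page_mask ==
		 * 0) we step one page at a time.
		 */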
		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
		start += page_increm * PAGE_SIZE;
		cond_resched();
	}
}

/*
 * mlock_fixup - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 */
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
	unsigned long start, unsigned long end, vm_flags_t newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgoff_t pgoff;
	int nr_pages;
	int ret = 0;
	int lock = !!(newflags & VM_LOCKED);

	if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
		goto out;	/* don't set VM_LOCKED, don't count */

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma));
	if (*prev) {
		vma = *prev;
		goto success;
	}

	if (start != vma->vm_start) {
		ret = split_vma(mm, vma, start, 1);
		if (ret)
			goto out;
	}

	if (end != vma->vm_end) {
		ret = split_vma(mm, vma, end, 0);
		if (ret)
			goto out;
	}

success:
	/*
	 * Keep track of amount of locked VM.
	 */
	nr_pages = (end - start) >> PAGE_SHIFT;
	if (!lock)
		nr_pages = -nr_pages;
	mm->locked_vm += nr_pages;

	/*
	 * vm_flags is protected by the mmap_sem held in write mode.
	 * It's okay if try_to_unmap_one unmaps a page just after we
	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
	 */

	if (lock)
		vma->vm_flags = newflags;
	else
		munlock_vma_pages_range(vma, start, end);

out:
	*prev = vma;
	return ret;
}

static int do_mlock(unsigned long start, size_t len, int on)
{
	unsigned long nstart, end, tmp;
	struct vm_area_struct *vma, *prev;
	int error;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;
	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;
	vma = find_vma(current->mm, start);
	if (!vma || vma->vm_start > start)
		return -ENOMEM;

	prev = vma->vm_prev;
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start; ; ) {
		vm_flags_t newflags;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		newflags = vma->vm_flags & ~VM_LOCKED;
		if (on)
			newflags |= VM_LOCKED;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mlock_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			break;
		nstart = tmp;
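		/*
		 * mlock_fixup() may have merged the vma with a neighbour via
		 * vma_merge(); anything merged in already carried the
		 * requested flags, so continue from the end of *prev.
		 */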
		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
	}
	return error;
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(len != PAGE_ALIGN(len));
	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
		} else if (nstart >= vma->vm_end)
			vma = vma->vm_next;
		if (!vma || vma->vm_start >= end)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. __mlock_vma_pages_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			ret = __mlock_posix_error_return(ret);
			break;
		}
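		/*
		 * ret is the number of pages __mlock_vma_pages_range()
		 * faulted in; resume the scan just after them.
		 */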
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;	/* 0 or negative error code */
}

SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	unsigned long locked;
	unsigned long lock_limit;
	int error = -ENOMEM;

	if (!can_do_mlock())
		return -EPERM;

	lru_add_drain_all();	/* flush pagevec */

	down_write(&current->mm->mmap_sem);
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;

	locked = len >> PAGE_SHIFT;
	locked += current->mm->locked_vm;

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	/* check against resource limits */
	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
		error = do_mlock(start, len, 1);
	up_write(&current->mm->mmap_sem);
	if (!error)
		error = __mm_populate(start, len, 0);
	return error;
}

SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
	start &= PAGE_MASK;
	ret = do_mlock(start, len, 0);
	up_write(&current->mm->mmap_sem);
	return ret;
}

static int do_mlockall(int flags)
{
	struct vm_area_struct *vma, *prev = NULL;

	if (flags & MCL_FUTURE)
		current->mm->def_flags |= VM_LOCKED;
	else
		current->mm->def_flags &= ~VM_LOCKED;
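	/*
	 * MCL_FUTURE on its own only changes def_flags for mappings created
	 * later; existing vmas are left untouched.
	 */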
	if (flags == MCL_FUTURE)
		goto out;

	for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
		vm_flags_t newflags;

		newflags = vma->vm_flags & ~VM_LOCKED;
		if (flags & MCL_CURRENT)
			newflags |= VM_LOCKED;

		/* Ignore errors */
		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
	}
out:
	return 0;
}

SYSCALL_DEFINE1(mlockall, int, flags)
{
	unsigned long lock_limit;
	int ret = -EINVAL;

	if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE)))
		goto out;

	ret = -EPERM;
	if (!can_do_mlock())
		goto out;

	if (flags & MCL_CURRENT)
		lru_add_drain_all();	/* flush pagevec */

	down_write(&current->mm->mmap_sem);

	lock_limit = rlimit(RLIMIT_MEMLOCK);
	lock_limit >>= PAGE_SHIFT;

	ret = -ENOMEM;
	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
	    capable(CAP_IPC_LOCK))
		ret = do_mlockall(flags);
	up_write(&current->mm->mmap_sem);
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);
out:
	return ret;
}

SYSCALL_DEFINE0(munlockall)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = do_mlockall(0);
	up_write(&current->mm->mmap_sem);
	return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) get accounted against the user_struct instead.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

int user_shm_lock(size_t size, struct user_struct *user)
{
	unsigned long lock_limit, locked;
	int allowed = 0;

	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	lock_limit = rlimit(RLIMIT_MEMLOCK);
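	/*
	 * Treat RLIM_INFINITY explicitly as "no limit" before shifting the
	 * limit down to pages.
	 */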
	if (lock_limit == RLIM_INFINITY)
		allowed = 1;
	lock_limit >>= PAGE_SHIFT;
	spin_lock(&shmlock_user_lock);
	if (!allowed &&
	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
		goto out;
	get_uid(user);
	user->locked_shm += locked;
	allowed = 1;
out:
	spin_unlock(&shmlock_user_lock);
	return allowed;
}

void user_shm_unlock(size_t size, struct user_struct *user)
{
	spin_lock(&shmlock_user_lock);
	user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	spin_unlock(&shmlock_user_lock);
	free_uid(user);
}