Root/mm/shmem.c

1/*
2 * Resizable virtual memory filesystem for Linux.
3 *
4 * Copyright (C) 2000 Linus Torvalds.
5 * 2000 Transmeta Corp.
6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 *
13 * Extended attribute support for tmpfs:
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
20 * This file is released under the GPL.
21 */
22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/pagemap.h>
28#include <linux/file.h>
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h>
33
34static struct vfsmount *shm_mnt;
35
36#ifdef CONFIG_SHMEM
37/*
38 * This virtual memory filesystem is heavily based on the ramfs. It
39 * extends ramfs by the ability to use swap and honor resource limits
40 * which makes it a completely usable filesystem.
41 */
42
43#include <linux/xattr.h>
44#include <linux/exportfs.h>
45#include <linux/posix_acl.h>
46#include <linux/generic_acl.h>
47#include <linux/mman.h>
48#include <linux/string.h>
49#include <linux/slab.h>
50#include <linux/backing-dev.h>
51#include <linux/shmem_fs.h>
52#include <linux/writeback.h>
53#include <linux/blkdev.h>
54#include <linux/security.h>
55#include <linux/swapops.h>
56#include <linux/mempolicy.h>
57#include <linux/namei.h>
58#include <linux/ctype.h>
59#include <linux/migrate.h>
60#include <linux/highmem.h>
61#include <linux/seq_file.h>
62#include <linux/magic.h>
63
64#include <asm/uaccess.h>
65#include <asm/div64.h>
66#include <asm/pgtable.h>
67
68/*
69 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
70 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
71 *
72 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
73 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
74 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
75 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
76 *
77 * We use / and * instead of shifts in the definitions below, so that the swap
78 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
79 */
80#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
81#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
82
83#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
84#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
85
86#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
87#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
88
89#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
90#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
91
92/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
93#define SHMEM_PAGEIN VM_READ
94#define SHMEM_TRUNCATE VM_WRITE
95
96/* Definition to limit shmem_truncate's steps between cond_rescheds */
97#define LATENCY_LIMIT 64
98
99/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20
101
102struct shmem_xattr {
103    struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104    char *name; /* xattr name */
105    size_t size;
106    char value[0];
107};
108
109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
110enum sgp_type {
111    SGP_READ, /* don't exceed i_size, don't allocate page */
112    SGP_CACHE, /* don't exceed i_size, may allocate page */
113    SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
114    SGP_WRITE, /* may exceed i_size, may allocate page */
115};
116
117#ifdef CONFIG_TMPFS
118static unsigned long shmem_default_max_blocks(void)
119{
120    return totalram_pages / 2;
121}
122
123static unsigned long shmem_default_max_inodes(void)
124{
125    return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
126}
127#endif
128
129static int shmem_getpage(struct inode *inode, unsigned long idx,
130             struct page **pagep, enum sgp_type sgp, int *type);
131
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{
134    /*
135     * The above definition of ENTRIES_PER_PAGE, and the use of
136     * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
137     * might be reconsidered if it ever diverges from PAGE_SIZE.
138     *
139     * Mobility flags are masked out as swap vectors cannot move
140     */
141    return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
142                PAGE_CACHE_SHIFT-PAGE_SHIFT);
143}
144
145static inline void shmem_dir_free(struct page *page)
146{
147    __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
148}
149
150static struct page **shmem_dir_map(struct page *page)
151{
152    return (struct page **)kmap_atomic(page, KM_USER0);
153}
154
155static inline void shmem_dir_unmap(struct page **dir)
156{
157    kunmap_atomic(dir, KM_USER0);
158}
159
160static swp_entry_t *shmem_swp_map(struct page *page)
161{
162    return (swp_entry_t *)kmap_atomic(page, KM_USER1);
163}
164
165static inline void shmem_swp_balance_unmap(void)
166{
167    /*
168     * When passing a pointer to an i_direct entry, to code which
169     * also handles indirect entries and so will shmem_swp_unmap,
170     * we must arrange for the preempt count to remain in balance.
171     * What kmap_atomic of a lowmem page does depends on config
172     * and architecture, so pretend to kmap_atomic some lowmem page.
173     */
174    (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
175}
176
177static inline void shmem_swp_unmap(swp_entry_t *entry)
178{
179    kunmap_atomic(entry, KM_USER1);
180}
181
182static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
183{
184    return sb->s_fs_info;
185}
186
187/*
188 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
189 * for shared memory and for shared anonymous (/dev/zero) mappings
190 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
191 * consistent with the pre-accounting of private mappings ...
192 */
193static inline int shmem_acct_size(unsigned long flags, loff_t size)
194{
195    return (flags & VM_NORESERVE) ?
196        0 : security_vm_enough_memory_kern(VM_ACCT(size));
197}
198
199static inline void shmem_unacct_size(unsigned long flags, loff_t size)
200{
201    if (!(flags & VM_NORESERVE))
202        vm_unacct_memory(VM_ACCT(size));
203}
204
205/*
206 * ... whereas tmpfs objects are accounted incrementally as
207 * pages are allocated, in order to allow huge sparse files.
208 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
209 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
210 */
211static inline int shmem_acct_block(unsigned long flags)
212{
213    return (flags & VM_NORESERVE) ?
214        security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0;
215}
216
217static inline void shmem_unacct_blocks(unsigned long flags, long pages)
218{
219    if (flags & VM_NORESERVE)
220        vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
221}
222
223static const struct super_operations shmem_ops;
224static const struct address_space_operations shmem_aops;
225static const struct file_operations shmem_file_operations;
226static const struct inode_operations shmem_inode_operations;
227static const struct inode_operations shmem_dir_inode_operations;
228static const struct inode_operations shmem_special_inode_operations;
229static const struct vm_operations_struct shmem_vm_ops;
230
231static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
232    .ra_pages = 0, /* No readahead */
233    .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
234};
235
236static LIST_HEAD(shmem_swaplist);
237static DEFINE_MUTEX(shmem_swaplist_mutex);
238
239static void shmem_free_blocks(struct inode *inode, long pages)
240{
241    struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242    if (sbinfo->max_blocks) {
243        percpu_counter_add(&sbinfo->used_blocks, -pages);
244        spin_lock(&inode->i_lock);
245        inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246        spin_unlock(&inode->i_lock);
247    }
248}
249
250static int shmem_reserve_inode(struct super_block *sb)
251{
252    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
253    if (sbinfo->max_inodes) {
254        spin_lock(&sbinfo->stat_lock);
255        if (!sbinfo->free_inodes) {
256            spin_unlock(&sbinfo->stat_lock);
257            return -ENOSPC;
258        }
259        sbinfo->free_inodes--;
260        spin_unlock(&sbinfo->stat_lock);
261    }
262    return 0;
263}
264
265static void shmem_free_inode(struct super_block *sb)
266{
267    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
268    if (sbinfo->max_inodes) {
269        spin_lock(&sbinfo->stat_lock);
270        sbinfo->free_inodes++;
271        spin_unlock(&sbinfo->stat_lock);
272    }
273}
274
275/**
276 * shmem_recalc_inode - recalculate the size of an inode
277 * @inode: inode to recalc
278 *
279 * We have to calculate the free blocks since the mm can drop
280 * undirtied hole pages behind our back.
281 *
282 * But normally info->alloced == inode->i_mapping->nrpages + info->swapped
283 * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped)
284 *
285 * It has to be called with the spinlock held.
286 */
287static void shmem_recalc_inode(struct inode *inode)
288{
289    struct shmem_inode_info *info = SHMEM_I(inode);
290    long freed;
291
292    freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
293    if (freed > 0) {
294        info->alloced -= freed;
295        shmem_unacct_blocks(info->flags, freed);
296        shmem_free_blocks(inode, freed);
297    }
298}
299
300/**
301 * shmem_swp_entry - find the swap vector position in the info structure
302 * @info: info structure for the inode
303 * @index: index of the page to find
304 * @page: optional page to add to the structure. Has to be preset to
305 * all zeros
306 *
307 * If there is no space allocated yet it will return NULL when
308 * page is NULL, else it will use the page for the needed block,
309 * setting it to NULL on return to indicate that it has been used.
310 *
311 * The swap vector is organized the following way:
312 *
313 * There are SHMEM_NR_DIRECT entries directly stored in the
314 * shmem_inode_info structure. So small files do not need an addional
315 * allocation.
316 *
317 * For pages with index > SHMEM_NR_DIRECT there is the pointer
318 * i_indirect which points to a page which holds in the first half
319 * doubly indirect blocks, in the second half triple indirect blocks:
320 *
321 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
322 * following layout (for SHMEM_NR_DIRECT == 16):
323 *
324 * i_indirect -> dir --> 16-19
325 * | +-> 20-23
326 * |
327 * +-->dir2 --> 24-27
328 * | +-> 28-31
329 * | +-> 32-35
330 * | +-> 36-39
331 * |
332 * +-->dir3 --> 40-43
333 * +-> 44-47
334 * +-> 48-51
335 * +-> 52-55
336 */
337static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
338{
339    unsigned long offset;
340    struct page **dir;
341    struct page *subdir;
342
343    if (index < SHMEM_NR_DIRECT) {
344        shmem_swp_balance_unmap();
345        return info->i_direct+index;
346    }
347    if (!info->i_indirect) {
348        if (page) {
349            info->i_indirect = *page;
350            *page = NULL;
351        }
352        return NULL; /* need another page */
353    }
354
355    index -= SHMEM_NR_DIRECT;
356    offset = index % ENTRIES_PER_PAGE;
357    index /= ENTRIES_PER_PAGE;
358    dir = shmem_dir_map(info->i_indirect);
359
360    if (index >= ENTRIES_PER_PAGE/2) {
361        index -= ENTRIES_PER_PAGE/2;
362        dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
363        index %= ENTRIES_PER_PAGE;
364        subdir = *dir;
365        if (!subdir) {
366            if (page) {
367                *dir = *page;
368                *page = NULL;
369            }
370            shmem_dir_unmap(dir);
371            return NULL; /* need another page */
372        }
373        shmem_dir_unmap(dir);
374        dir = shmem_dir_map(subdir);
375    }
376
377    dir += index;
378    subdir = *dir;
379    if (!subdir) {
380        if (!page || !(subdir = *page)) {
381            shmem_dir_unmap(dir);
382            return NULL; /* need a page */
383        }
384        *dir = subdir;
385        *page = NULL;
386    }
387    shmem_dir_unmap(dir);
388    return shmem_swp_map(subdir) + offset;
389}
390
391static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
392{
393    long incdec = value? 1: -1;
394
395    entry->val = value;
396    info->swapped += incdec;
397    if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
398        struct page *page = kmap_atomic_to_page(entry);
399        set_page_private(page, page_private(page) + incdec);
400    }
401}
402
403/**
404 * shmem_swp_alloc - get the position of the swap entry for the page.
405 * @info: info structure for the inode
406 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation?
408 *
409 * If the entry does not exist, allocate it.
410 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
412{
413    struct inode *inode = &info->vfs_inode;
414    struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
415    struct page *page = NULL;
416    swp_entry_t *entry;
417
418    if (sgp != SGP_WRITE &&
419        ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
420        return ERR_PTR(-EINVAL);
421
422    while (!(entry = shmem_swp_entry(info, index, &page))) {
423        if (sgp == SGP_READ)
424            return shmem_swp_map(ZERO_PAGE(0));
425        /*
426         * Test used_blocks against 1 less max_blocks, since we have 1 data
427         * page (and perhaps indirect index pages) yet to allocate:
428         * a waste to allocate index if we cannot allocate data.
429         */
430        if (sbinfo->max_blocks) {
431            if (percpu_counter_compare(&sbinfo->used_blocks,
432                        sbinfo->max_blocks - 1) >= 0)
433                return ERR_PTR(-ENOSPC);
434            percpu_counter_inc(&sbinfo->used_blocks);
435            spin_lock(&inode->i_lock);
436            inode->i_blocks += BLOCKS_PER_PAGE;
437            spin_unlock(&inode->i_lock);
438        }
439
440        spin_unlock(&info->lock);
441        page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
442        spin_lock(&info->lock);
443
444        if (!page) {
445            shmem_free_blocks(inode, 1);
446            return ERR_PTR(-ENOMEM);
447        }
448        if (sgp != SGP_WRITE &&
449            ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
450            entry = ERR_PTR(-EINVAL);
451            break;
452        }
453        if (info->next_index <= index)
454            info->next_index = index + 1;
455    }
456    if (page) {
457        /* another task gave its page, or truncated the file */
458        shmem_free_blocks(inode, 1);
459        shmem_dir_free(page);
460    }
461    if (info->next_index <= index && !IS_ERR(entry))
462        info->next_index = index + 1;
463    return entry;
464}
465
466/**
467 * shmem_free_swp - free some swap entries in a directory
468 * @dir: pointer to the directory
469 * @edir: pointer after last entry of the directory
470 * @punch_lock: pointer to spinlock when needed for the holepunch case
471 */
472static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
473                        spinlock_t *punch_lock)
474{
475    spinlock_t *punch_unlock = NULL;
476    swp_entry_t *ptr;
477    int freed = 0;
478
479    for (ptr = dir; ptr < edir; ptr++) {
480        if (ptr->val) {
481            if (unlikely(punch_lock)) {
482                punch_unlock = punch_lock;
483                punch_lock = NULL;
484                spin_lock(punch_unlock);
485                if (!ptr->val)
486                    continue;
487            }
488            free_swap_and_cache(*ptr);
489            *ptr = (swp_entry_t){0};
490            freed++;
491        }
492    }
493    if (punch_unlock)
494        spin_unlock(punch_unlock);
495    return freed;
496}
497
498static int shmem_map_and_free_swp(struct page *subdir, int offset,
499        int limit, struct page ***dir, spinlock_t *punch_lock)
500{
501    swp_entry_t *ptr;
502    int freed = 0;
503
504    ptr = shmem_swp_map(subdir);
505    for (; offset < limit; offset += LATENCY_LIMIT) {
506        int size = limit - offset;
507        if (size > LATENCY_LIMIT)
508            size = LATENCY_LIMIT;
509        freed += shmem_free_swp(ptr+offset, ptr+offset+size,
510                            punch_lock);
511        if (need_resched()) {
512            shmem_swp_unmap(ptr);
513            if (*dir) {
514                shmem_dir_unmap(*dir);
515                *dir = NULL;
516            }
517            cond_resched();
518            ptr = shmem_swp_map(subdir);
519        }
520    }
521    shmem_swp_unmap(ptr);
522    return freed;
523}
524
525static void shmem_free_pages(struct list_head *next)
526{
527    struct page *page;
528    int freed = 0;
529
530    do {
531        page = container_of(next, struct page, lru);
532        next = next->next;
533        shmem_dir_free(page);
534        freed++;
535        if (freed >= LATENCY_LIMIT) {
536            cond_resched();
537            freed = 0;
538        }
539    } while (next);
540}
541
542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
543{
544    struct shmem_inode_info *info = SHMEM_I(inode);
545    unsigned long idx;
546    unsigned long size;
547    unsigned long limit;
548    unsigned long stage;
549    unsigned long diroff;
550    struct page **dir;
551    struct page *topdir;
552    struct page *middir;
553    struct page *subdir;
554    swp_entry_t *ptr;
555    LIST_HEAD(pages_to_free);
556    long nr_pages_to_free = 0;
557    long nr_swaps_freed = 0;
558    int offset;
559    int freed;
560    int punch_hole;
561    spinlock_t *needs_lock;
562    spinlock_t *punch_lock;
563    unsigned long upper_limit;
564
565    truncate_inode_pages_range(inode->i_mapping, start, end);
566
567    inode->i_ctime = inode->i_mtime = CURRENT_TIME;
568    idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
569    if (idx >= info->next_index)
570        return;
571
572    spin_lock(&info->lock);
573    info->flags |= SHMEM_TRUNCATE;
574    if (likely(end == (loff_t) -1)) {
575        limit = info->next_index;
576        upper_limit = SHMEM_MAX_INDEX;
577        info->next_index = idx;
578        needs_lock = NULL;
579        punch_hole = 0;
580    } else {
581        if (end + 1 >= inode->i_size) { /* we may free a little more */
582            limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
583                            PAGE_CACHE_SHIFT;
584            upper_limit = SHMEM_MAX_INDEX;
585        } else {
586            limit = (end + 1) >> PAGE_CACHE_SHIFT;
587            upper_limit = limit;
588        }
589        needs_lock = &info->lock;
590        punch_hole = 1;
591    }
592
593    topdir = info->i_indirect;
594    if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
595        info->i_indirect = NULL;
596        nr_pages_to_free++;
597        list_add(&topdir->lru, &pages_to_free);
598    }
599    spin_unlock(&info->lock);
600
601    if (info->swapped && idx < SHMEM_NR_DIRECT) {
602        ptr = info->i_direct;
603        size = limit;
604        if (size > SHMEM_NR_DIRECT)
605            size = SHMEM_NR_DIRECT;
606        nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
607    }
608
609    /*
610     * If there are no indirect blocks or we are punching a hole
611     * below indirect blocks, nothing to be done.
612     */
613    if (!topdir || limit <= SHMEM_NR_DIRECT)
614        goto done2;
615
616    /*
617     * The truncation case has already dropped info->lock, and we're safe
618     * because i_size and next_index have already been lowered, preventing
619     * access beyond. But in the punch_hole case, we still need to take
620     * the lock when updating the swap directory, because there might be
621     * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
622     * shmem_writepage. However, whenever we find we can remove a whole
623     * directory page (not at the misaligned start or end of the range),
624     * we first NULLify its pointer in the level above, and then have no
625     * need to take the lock when updating its contents: needs_lock and
626     * punch_lock (either pointing to info->lock or NULL) manage this.
627     */
628
629    upper_limit -= SHMEM_NR_DIRECT;
630    limit -= SHMEM_NR_DIRECT;
631    idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
632    offset = idx % ENTRIES_PER_PAGE;
633    idx -= offset;
634
635    dir = shmem_dir_map(topdir);
636    stage = ENTRIES_PER_PAGEPAGE/2;
637    if (idx < ENTRIES_PER_PAGEPAGE/2) {
638        middir = topdir;
639        diroff = idx/ENTRIES_PER_PAGE;
640    } else {
641        dir += ENTRIES_PER_PAGE/2;
642        dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
643        while (stage <= idx)
644            stage += ENTRIES_PER_PAGEPAGE;
645        middir = *dir;
646        if (*dir) {
647            diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
648                ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
649            if (!diroff && !offset && upper_limit >= stage) {
650                if (needs_lock) {
651                    spin_lock(needs_lock);
652                    *dir = NULL;
653                    spin_unlock(needs_lock);
654                    needs_lock = NULL;
655                } else
656                    *dir = NULL;
657                nr_pages_to_free++;
658                list_add(&middir->lru, &pages_to_free);
659            }
660            shmem_dir_unmap(dir);
661            dir = shmem_dir_map(middir);
662        } else {
663            diroff = 0;
664            offset = 0;
665            idx = stage;
666        }
667    }
668
669    for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
670        if (unlikely(idx == stage)) {
671            shmem_dir_unmap(dir);
672            dir = shmem_dir_map(topdir) +
673                ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
674            while (!*dir) {
675                dir++;
676                idx += ENTRIES_PER_PAGEPAGE;
677                if (idx >= limit)
678                    goto done1;
679            }
680            stage = idx + ENTRIES_PER_PAGEPAGE;
681            middir = *dir;
682            if (punch_hole)
683                needs_lock = &info->lock;
684            if (upper_limit >= stage) {
685                if (needs_lock) {
686                    spin_lock(needs_lock);
687                    *dir = NULL;
688                    spin_unlock(needs_lock);
689                    needs_lock = NULL;
690                } else
691                    *dir = NULL;
692                nr_pages_to_free++;
693                list_add(&middir->lru, &pages_to_free);
694            }
695            shmem_dir_unmap(dir);
696            cond_resched();
697            dir = shmem_dir_map(middir);
698            diroff = 0;
699        }
700        punch_lock = needs_lock;
701        subdir = dir[diroff];
702        if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
703            if (needs_lock) {
704                spin_lock(needs_lock);
705                dir[diroff] = NULL;
706                spin_unlock(needs_lock);
707                punch_lock = NULL;
708            } else
709                dir[diroff] = NULL;
710            nr_pages_to_free++;
711            list_add(&subdir->lru, &pages_to_free);
712        }
713        if (subdir && page_private(subdir) /* has swap entries */) {
714            size = limit - idx;
715            if (size > ENTRIES_PER_PAGE)
716                size = ENTRIES_PER_PAGE;
717            freed = shmem_map_and_free_swp(subdir,
718                    offset, size, &dir, punch_lock);
719            if (!dir)
720                dir = shmem_dir_map(middir);
721            nr_swaps_freed += freed;
722            if (offset || punch_lock) {
723                spin_lock(&info->lock);
724                set_page_private(subdir,
725                    page_private(subdir) - freed);
726                spin_unlock(&info->lock);
727            } else
728                BUG_ON(page_private(subdir) != freed);
729        }
730        offset = 0;
731    }
732done1:
733    shmem_dir_unmap(dir);
734done2:
735    if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
736        /*
737         * Call truncate_inode_pages again: racing shmem_unuse_inode
738         * may have swizzled a page in from swap since
739         * truncate_pagecache or generic_delete_inode did it, before we
740         * lowered next_index. Also, though shmem_getpage checks
741         * i_size before adding to cache, no recheck after: so fix the
742         * narrow window there too.
743         */
744        truncate_inode_pages_range(inode->i_mapping, start, end);
745    }
746
747    spin_lock(&info->lock);
748    info->flags &= ~SHMEM_TRUNCATE;
749    info->swapped -= nr_swaps_freed;
750    if (nr_pages_to_free)
751        shmem_free_blocks(inode, nr_pages_to_free);
752    shmem_recalc_inode(inode);
753    spin_unlock(&info->lock);
754
755    /*
756     * Empty swap vector directory pages to be freed?
757     */
758    if (!list_empty(&pages_to_free)) {
759        pages_to_free.prev->next = NULL;
760        shmem_free_pages(pages_to_free.next);
761    }
762}
763EXPORT_SYMBOL_GPL(shmem_truncate_range);
764
765static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
766{
767    struct inode *inode = dentry->d_inode;
768    int error;
769
770    error = inode_change_ok(inode, attr);
771    if (error)
772        return error;
773
774    if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
775        loff_t oldsize = inode->i_size;
776        loff_t newsize = attr->ia_size;
777        struct page *page = NULL;
778
779        if (newsize < oldsize) {
780            /*
781             * If truncating down to a partial page, then
782             * if that page is already allocated, hold it
783             * in memory until the truncation is over, so
784             * truncate_partial_page cannot miss it were
785             * it assigned to swap.
786             */
787            if (newsize & (PAGE_CACHE_SIZE-1)) {
788                (void) shmem_getpage(inode,
789                    newsize >> PAGE_CACHE_SHIFT,
790                        &page, SGP_READ, NULL);
791                if (page)
792                    unlock_page(page);
793            }
794            /*
795             * Reset SHMEM_PAGEIN flag so that shmem_truncate can
796             * detect if any pages might have been added to cache
797             * after truncate_inode_pages. But we needn't bother
798             * if it's being fully truncated to zero-length: the
799             * nrpages check is efficient enough in that case.
800             */
801            if (newsize) {
802                struct shmem_inode_info *info = SHMEM_I(inode);
803                spin_lock(&info->lock);
804                info->flags &= ~SHMEM_PAGEIN;
805                spin_unlock(&info->lock);
806            }
807        }
808        if (newsize != oldsize) {
809            i_size_write(inode, newsize);
810            inode->i_ctime = inode->i_mtime = CURRENT_TIME;
811        }
812        if (newsize < oldsize) {
813            loff_t holebegin = round_up(newsize, PAGE_SIZE);
814            unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
815            shmem_truncate_range(inode, newsize, (loff_t)-1);
816            /* unmap again to remove racily COWed private pages */
817            unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818        }
819        if (page)
820            page_cache_release(page);
821    }
822
823    setattr_copy(inode, attr);
824#ifdef CONFIG_TMPFS_POSIX_ACL
825    if (attr->ia_valid & ATTR_MODE)
826        error = generic_acl_chmod(inode);
827#endif
828    return error;
829}
830
831static void shmem_evict_inode(struct inode *inode)
832{
833    struct shmem_inode_info *info = SHMEM_I(inode);
834    struct shmem_xattr *xattr, *nxattr;
835
836    if (inode->i_mapping->a_ops == &shmem_aops) {
837        shmem_unacct_size(info->flags, inode->i_size);
838        inode->i_size = 0;
839        shmem_truncate_range(inode, 0, (loff_t)-1);
840        if (!list_empty(&info->swaplist)) {
841            mutex_lock(&shmem_swaplist_mutex);
842            list_del_init(&info->swaplist);
843            mutex_unlock(&shmem_swaplist_mutex);
844        }
845    }
846
847    list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848        kfree(xattr->name);
849        kfree(xattr);
850    }
851    BUG_ON(inode->i_blocks);
852    shmem_free_inode(inode->i_sb);
853    end_writeback(inode);
854}
855
856static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
857{
858    swp_entry_t *ptr;
859
860    for (ptr = dir; ptr < edir; ptr++) {
861        if (ptr->val == entry.val)
862            return ptr - dir;
863    }
864    return -1;
865}
866
867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
868{
869    struct address_space *mapping;
870    unsigned long idx;
871    unsigned long size;
872    unsigned long limit;
873    unsigned long stage;
874    struct page **dir;
875    struct page *subdir;
876    swp_entry_t *ptr;
877    int offset;
878    int error;
879
880    idx = 0;
881    ptr = info->i_direct;
882    spin_lock(&info->lock);
883    if (!info->swapped) {
884        list_del_init(&info->swaplist);
885        goto lost2;
886    }
887    limit = info->next_index;
888    size = limit;
889    if (size > SHMEM_NR_DIRECT)
890        size = SHMEM_NR_DIRECT;
891    offset = shmem_find_swp(entry, ptr, ptr+size);
892    if (offset >= 0) {
893        shmem_swp_balance_unmap();
894        goto found;
895    }
896    if (!info->i_indirect)
897        goto lost2;
898
899    dir = shmem_dir_map(info->i_indirect);
900    stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
901
902    for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
903        if (unlikely(idx == stage)) {
904            shmem_dir_unmap(dir-1);
905            if (cond_resched_lock(&info->lock)) {
906                /* check it has not been truncated */
907                if (limit > info->next_index) {
908                    limit = info->next_index;
909                    if (idx >= limit)
910                        goto lost2;
911                }
912            }
913            dir = shmem_dir_map(info->i_indirect) +
914                ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
915            while (!*dir) {
916                dir++;
917                idx += ENTRIES_PER_PAGEPAGE;
918                if (idx >= limit)
919                    goto lost1;
920            }
921            stage = idx + ENTRIES_PER_PAGEPAGE;
922            subdir = *dir;
923            shmem_dir_unmap(dir);
924            dir = shmem_dir_map(subdir);
925        }
926        subdir = *dir;
927        if (subdir && page_private(subdir)) {
928            ptr = shmem_swp_map(subdir);
929            size = limit - idx;
930            if (size > ENTRIES_PER_PAGE)
931                size = ENTRIES_PER_PAGE;
932            offset = shmem_find_swp(entry, ptr, ptr+size);
933            shmem_swp_unmap(ptr);
934            if (offset >= 0) {
935                shmem_dir_unmap(dir);
936                ptr = shmem_swp_map(subdir);
937                goto found;
938            }
939        }
940    }
941lost1:
942    shmem_dir_unmap(dir-1);
943lost2:
944    spin_unlock(&info->lock);
945    return 0;
946found:
947    idx += offset;
948    ptr += offset;
949
950    /*
951     * Move _head_ to start search for next from here.
952     * But be careful: shmem_evict_inode checks list_empty without taking
953     * mutex, and there's an instant in list_move_tail when info->swaplist
954     * would appear empty, if it were the only one on shmem_swaplist. We
955     * could avoid doing it if inode NULL; or use this minor optimization.
956     */
957    if (shmem_swaplist.next != &info->swaplist)
958        list_move_tail(&shmem_swaplist, &info->swaplist);
959
960    /*
961     * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
962     * but also to hold up shmem_evict_inode(): so inode cannot be freed
963     * beneath us (pagelock doesn't help until the page is in pagecache).
964     */
965    mapping = info->vfs_inode.i_mapping;
966    error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
967    /* which does mem_cgroup_uncharge_cache_page on error */
968
969    if (error == -EEXIST) {
970        struct page *filepage = find_get_page(mapping, idx);
971        error = 1;
972        if (filepage) {
973            /*
974             * There might be a more uptodate page coming down
975             * from a stacked writepage: forget our swappage if so.
976             */
977            if (PageUptodate(filepage))
978                error = 0;
979            page_cache_release(filepage);
980        }
981    }
982    if (!error) {
983        delete_from_swap_cache(page);
984        set_page_dirty(page);
985        info->flags |= SHMEM_PAGEIN;
986        shmem_swp_set(info, ptr, 0);
987        swap_free(entry);
988        error = 1; /* not an error, but entry was found */
989    }
990    shmem_swp_unmap(ptr);
991    spin_unlock(&info->lock);
992    return error;
993}
994
995/*
996 * shmem_unuse() search for an eventually swapped out shmem page.
997 */
998int shmem_unuse(swp_entry_t entry, struct page *page)
999{
1000    struct list_head *p, *next;
1001    struct shmem_inode_info *info;
1002    int found = 0;
1003    int error;
1004
1005    /*
1006     * Charge page using GFP_KERNEL while we can wait, before taking
1007     * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008     * Charged back to the user (not to caller) when swap account is used.
1009     * add_to_page_cache() will be called with GFP_NOWAIT.
1010     */
1011    error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012    if (error)
1013        goto out;
1014    /*
1015     * Try to preload while we can wait, to not make a habit of
1016     * draining atomic reserves; but don't latch on to this cpu,
1017     * it's okay if sometimes we get rescheduled after this.
1018     */
1019    error = radix_tree_preload(GFP_KERNEL);
1020    if (error)
1021        goto uncharge;
1022    radix_tree_preload_end();
1023
1024    mutex_lock(&shmem_swaplist_mutex);
1025    list_for_each_safe(p, next, &shmem_swaplist) {
1026        info = list_entry(p, struct shmem_inode_info, swaplist);
1027        found = shmem_unuse_inode(info, entry, page);
1028        cond_resched();
1029        if (found)
1030            break;
1031    }
1032    mutex_unlock(&shmem_swaplist_mutex);
1033
1034uncharge:
1035    if (!found)
1036        mem_cgroup_uncharge_cache_page(page);
1037    if (found < 0)
1038        error = found;
1039out:
1040    unlock_page(page);
1041    page_cache_release(page);
1042    return error;
1043}
1044
1045/*
1046 * Move the page from the page cache to the swap cache.
1047 */
1048static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1049{
1050    struct shmem_inode_info *info;
1051    swp_entry_t *entry, swap;
1052    struct address_space *mapping;
1053    unsigned long index;
1054    struct inode *inode;
1055
1056    BUG_ON(!PageLocked(page));
1057    mapping = page->mapping;
1058    index = page->index;
1059    inode = mapping->host;
1060    info = SHMEM_I(inode);
1061    if (info->flags & VM_LOCKED)
1062        goto redirty;
1063    if (!total_swap_pages)
1064        goto redirty;
1065
1066    /*
1067     * shmem_backing_dev_info's capabilities prevent regular writeback or
1068     * sync from ever calling shmem_writepage; but a stacking filesystem
1069     * may use the ->writepage of its underlying filesystem, in which case
1070     * tmpfs should write out to swap only in response to memory pressure,
1071     * and not for the writeback threads or sync. However, in those cases,
1072     * we do still want to check if there's a redundant swappage to be
1073     * discarded.
1074     */
1075    if (wbc->for_reclaim)
1076        swap = get_swap_page();
1077    else
1078        swap.val = 0;
1079
1080    /*
1081     * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082     * if it's not already there. Do it now because we cannot take
1083     * mutex while holding spinlock, and must do so before the page
1084     * is moved to swap cache, when its pagelock no longer protects
1085     * the inode from eviction. But don't unlock the mutex until
1086     * we've taken the spinlock, because shmem_unuse_inode() will
1087     * prune a !swapped inode from the swaplist under both locks.
1088     */
1089    if (swap.val) {
1090        mutex_lock(&shmem_swaplist_mutex);
1091        if (list_empty(&info->swaplist))
1092            list_add_tail(&info->swaplist, &shmem_swaplist);
1093    }
1094
1095    spin_lock(&info->lock);
1096    if (swap.val)
1097        mutex_unlock(&shmem_swaplist_mutex);
1098
1099    if (index >= info->next_index) {
1100        BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1101        goto unlock;
1102    }
1103    entry = shmem_swp_entry(info, index, NULL);
1104    if (entry->val) {
1105        /*
1106         * The more uptodate page coming down from a stacked
1107         * writepage should replace our old swappage.
1108         */
1109        free_swap_and_cache(*entry);
1110        shmem_swp_set(info, entry, 0);
1111    }
1112    shmem_recalc_inode(inode);
1113
1114    if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115        delete_from_page_cache(page);
1116        shmem_swp_set(info, entry, swap.val);
1117        shmem_swp_unmap(entry);
1118        swap_shmem_alloc(swap);
1119        spin_unlock(&info->lock);
1120        BUG_ON(page_mapped(page));
1121        swap_writepage(page, wbc);
1122        return 0;
1123    }
1124
1125    shmem_swp_unmap(entry);
1126unlock:
1127    spin_unlock(&info->lock);
1128    /*
1129     * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1130     * clear SWAP_HAS_CACHE flag.
1131     */
1132    swapcache_free(swap, NULL);
1133redirty:
1134    set_page_dirty(page);
1135    if (wbc->for_reclaim)
1136        return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
1137    unlock_page(page);
1138    return 0;
1139}
1140
1141#ifdef CONFIG_NUMA
1142#ifdef CONFIG_TMPFS
1143static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1144{
1145    char buffer[64];
1146
1147    if (!mpol || mpol->mode == MPOL_DEFAULT)
1148        return; /* show nothing */
1149
1150    mpol_to_str(buffer, sizeof(buffer), mpol, 1);
1151
1152    seq_printf(seq, ",mpol=%s", buffer);
1153}
1154
1155static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1156{
1157    struct mempolicy *mpol = NULL;
1158    if (sbinfo->mpol) {
1159        spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1160        mpol = sbinfo->mpol;
1161        mpol_get(mpol);
1162        spin_unlock(&sbinfo->stat_lock);
1163    }
1164    return mpol;
1165}
1166#endif /* CONFIG_TMPFS */
1167
1168static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1169            struct shmem_inode_info *info, unsigned long idx)
1170{
1171    struct mempolicy mpol, *spol;
1172    struct vm_area_struct pvma;
1173    struct page *page;
1174
1175    spol = mpol_cond_copy(&mpol,
1176                mpol_shared_policy_lookup(&info->policy, idx));
1177
1178    /* Create a pseudo vma that just contains the policy */
1179    pvma.vm_start = 0;
1180    pvma.vm_pgoff = idx;
1181    pvma.vm_ops = NULL;
1182    pvma.vm_policy = spol;
1183    page = swapin_readahead(entry, gfp, &pvma, 0);
1184    return page;
1185}
1186
1187static struct page *shmem_alloc_page(gfp_t gfp,
1188            struct shmem_inode_info *info, unsigned long idx)
1189{
1190    struct vm_area_struct pvma;
1191
1192    /* Create a pseudo vma that just contains the policy */
1193    pvma.vm_start = 0;
1194    pvma.vm_pgoff = idx;
1195    pvma.vm_ops = NULL;
1196    pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1197
1198    /*
1199     * alloc_page_vma() will drop the shared policy reference
1200     */
1201    return alloc_page_vma(gfp, &pvma, 0);
1202}
1203#else /* !CONFIG_NUMA */
1204#ifdef CONFIG_TMPFS
1205static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
1206{
1207}
1208#endif /* CONFIG_TMPFS */
1209
1210static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1211            struct shmem_inode_info *info, unsigned long idx)
1212{
1213    return swapin_readahead(entry, gfp, NULL, 0);
1214}
1215
1216static inline struct page *shmem_alloc_page(gfp_t gfp,
1217            struct shmem_inode_info *info, unsigned long idx)
1218{
1219    return alloc_page(gfp);
1220}
1221#endif /* CONFIG_NUMA */
1222
1223#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
1224static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1225{
1226    return NULL;
1227}
1228#endif
1229
1230/*
1231 * shmem_getpage - either get the page from swap or allocate a new one
1232 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache
1236 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx,
1238            struct page **pagep, enum sgp_type sgp, int *type)
1239{
1240    struct address_space *mapping = inode->i_mapping;
1241    struct shmem_inode_info *info = SHMEM_I(inode);
1242    struct shmem_sb_info *sbinfo;
1243    struct page *filepage = *pagep;
1244    struct page *swappage;
1245    struct page *prealloc_page = NULL;
1246    swp_entry_t *entry;
1247    swp_entry_t swap;
1248    gfp_t gfp;
1249    int error;
1250
1251    if (idx >= SHMEM_MAX_INDEX)
1252        return -EFBIG;
1253
1254    if (type)
1255        *type = 0;
1256
1257    /*
1258     * Normally, filepage is NULL on entry, and either found
1259     * uptodate immediately, or allocated and zeroed, or read
1260     * in under swappage, which is then assigned to filepage.
1261     * But shmem_readpage (required for splice) passes in a locked
1262     * filepage, which may be found not uptodate by other callers
1263     * too, and may need to be copied from the swappage read in.
1264     */
1265repeat:
1266    if (!filepage)
1267        filepage = find_lock_page(mapping, idx);
1268    if (filepage && PageUptodate(filepage))
1269        goto done;
1270    gfp = mapping_gfp_mask(mapping);
1271    if (!filepage) {
1272        /*
1273         * Try to preload while we can wait, to not make a habit of
1274         * draining atomic reserves; but don't latch on to this cpu.
1275         */
1276        error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
1277        if (error)
1278            goto failed;
1279        radix_tree_preload_end();
1280        if (sgp != SGP_READ && !prealloc_page) {
1281            /* We don't care if this fails */
1282            prealloc_page = shmem_alloc_page(gfp, info, idx);
1283            if (prealloc_page) {
1284                if (mem_cgroup_cache_charge(prealloc_page,
1285                        current->mm, GFP_KERNEL)) {
1286                    page_cache_release(prealloc_page);
1287                    prealloc_page = NULL;
1288                }
1289            }
1290        }
1291    }
1292    error = 0;
1293
1294    spin_lock(&info->lock);
1295    shmem_recalc_inode(inode);
1296    entry = shmem_swp_alloc(info, idx, sgp);
1297    if (IS_ERR(entry)) {
1298        spin_unlock(&info->lock);
1299        error = PTR_ERR(entry);
1300        goto failed;
1301    }
1302    swap = *entry;
1303
1304    if (swap.val) {
1305        /* Look it up and read it in.. */
1306        swappage = lookup_swap_cache(swap);
1307        if (!swappage) {
1308            shmem_swp_unmap(entry);
1309            spin_unlock(&info->lock);
1310            /* here we actually do the io */
1311            if (type)
1312                *type |= VM_FAULT_MAJOR;
1313            swappage = shmem_swapin(swap, gfp, info, idx);
1314            if (!swappage) {
1315                spin_lock(&info->lock);
1316                entry = shmem_swp_alloc(info, idx, sgp);
1317                if (IS_ERR(entry))
1318                    error = PTR_ERR(entry);
1319                else {
1320                    if (entry->val == swap.val)
1321                        error = -ENOMEM;
1322                    shmem_swp_unmap(entry);
1323                }
1324                spin_unlock(&info->lock);
1325                if (error)
1326                    goto failed;
1327                goto repeat;
1328            }
1329            wait_on_page_locked(swappage);
1330            page_cache_release(swappage);
1331            goto repeat;
1332        }
1333
1334        /* We have to do this with page locked to prevent races */
1335        if (!trylock_page(swappage)) {
1336            shmem_swp_unmap(entry);
1337            spin_unlock(&info->lock);
1338            wait_on_page_locked(swappage);
1339            page_cache_release(swappage);
1340            goto repeat;
1341        }
1342        if (PageWriteback(swappage)) {
1343            shmem_swp_unmap(entry);
1344            spin_unlock(&info->lock);
1345            wait_on_page_writeback(swappage);
1346            unlock_page(swappage);
1347            page_cache_release(swappage);
1348            goto repeat;
1349        }
1350        if (!PageUptodate(swappage)) {
1351            shmem_swp_unmap(entry);
1352            spin_unlock(&info->lock);
1353            unlock_page(swappage);
1354            page_cache_release(swappage);
1355            error = -EIO;
1356            goto failed;
1357        }
1358
1359        if (filepage) {
1360            shmem_swp_set(info, entry, 0);
1361            shmem_swp_unmap(entry);
1362            delete_from_swap_cache(swappage);
1363            spin_unlock(&info->lock);
1364            copy_highpage(filepage, swappage);
1365            unlock_page(swappage);
1366            page_cache_release(swappage);
1367            flush_dcache_page(filepage);
1368            SetPageUptodate(filepage);
1369            set_page_dirty(filepage);
1370            swap_free(swap);
1371        } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372                    idx, GFP_NOWAIT))) {
1373            info->flags |= SHMEM_PAGEIN;
1374            shmem_swp_set(info, entry, 0);
1375            shmem_swp_unmap(entry);
1376            delete_from_swap_cache(swappage);
1377            spin_unlock(&info->lock);
1378            filepage = swappage;
1379            set_page_dirty(filepage);
1380            swap_free(swap);
1381        } else {
1382            shmem_swp_unmap(entry);
1383            spin_unlock(&info->lock);
1384            if (error == -ENOMEM) {
1385                /*
1386                 * reclaim from proper memory cgroup and
1387                 * call memcg's OOM if needed.
1388                 */
1389                error = mem_cgroup_shmem_charge_fallback(
1390                                swappage,
1391                                current->mm,
1392                                gfp);
1393                if (error) {
1394                    unlock_page(swappage);
1395                    page_cache_release(swappage);
1396                    goto failed;
1397                }
1398            }
1399            unlock_page(swappage);
1400            page_cache_release(swappage);
1401            goto repeat;
1402        }
1403    } else if (sgp == SGP_READ && !filepage) {
1404        shmem_swp_unmap(entry);
1405        filepage = find_get_page(mapping, idx);
1406        if (filepage &&
1407            (!PageUptodate(filepage) || !trylock_page(filepage))) {
1408            spin_unlock(&info->lock);
1409            wait_on_page_locked(filepage);
1410            page_cache_release(filepage);
1411            filepage = NULL;
1412            goto repeat;
1413        }
1414        spin_unlock(&info->lock);
1415    } else {
1416        shmem_swp_unmap(entry);
1417        sbinfo = SHMEM_SB(inode->i_sb);
1418        if (sbinfo->max_blocks) {
1419            if (percpu_counter_compare(&sbinfo->used_blocks,
1420                        sbinfo->max_blocks) >= 0 ||
1421                shmem_acct_block(info->flags))
1422                goto nospace;
1423            percpu_counter_inc(&sbinfo->used_blocks);
1424            spin_lock(&inode->i_lock);
1425            inode->i_blocks += BLOCKS_PER_PAGE;
1426            spin_unlock(&inode->i_lock);
1427        } else if (shmem_acct_block(info->flags))
1428            goto nospace;
1429
1430        if (!filepage) {
1431            int ret;
1432
1433            if (!prealloc_page) {
1434                spin_unlock(&info->lock);
1435                filepage = shmem_alloc_page(gfp, info, idx);
1436                if (!filepage) {
1437                    shmem_unacct_blocks(info->flags, 1);
1438                    shmem_free_blocks(inode, 1);
1439                    error = -ENOMEM;
1440                    goto failed;
1441                }
1442                SetPageSwapBacked(filepage);
1443
1444                /*
1445                 * Precharge page while we can wait, compensate
1446                 * after
1447                 */
1448                error = mem_cgroup_cache_charge(filepage,
1449                    current->mm, GFP_KERNEL);
1450                if (error) {
1451                    page_cache_release(filepage);
1452                    shmem_unacct_blocks(info->flags, 1);
1453                    shmem_free_blocks(inode, 1);
1454                    filepage = NULL;
1455                    goto failed;
1456                }
1457
1458                spin_lock(&info->lock);
1459            } else {
1460                filepage = prealloc_page;
1461                prealloc_page = NULL;
1462                SetPageSwapBacked(filepage);
1463            }
1464
1465            entry = shmem_swp_alloc(info, idx, sgp);
1466            if (IS_ERR(entry))
1467                error = PTR_ERR(entry);
1468            else {
1469                swap = *entry;
1470                shmem_swp_unmap(entry);
1471            }
1472            ret = error || swap.val;
1473            if (ret)
1474                mem_cgroup_uncharge_cache_page(filepage);
1475            else
1476                ret = add_to_page_cache_lru(filepage, mapping,
1477                        idx, GFP_NOWAIT);
1478            /*
1479             * At add_to_page_cache_lru() failure, uncharge will
1480             * be done automatically.
1481             */
1482            if (ret) {
1483                spin_unlock(&info->lock);
1484                page_cache_release(filepage);
1485                shmem_unacct_blocks(info->flags, 1);
1486                shmem_free_blocks(inode, 1);
1487                filepage = NULL;
1488                if (error)
1489                    goto failed;
1490                goto repeat;
1491            }
1492            info->flags |= SHMEM_PAGEIN;
1493        }
1494
1495        info->alloced++;
1496        spin_unlock(&info->lock);
1497        clear_highpage(filepage);
1498        flush_dcache_page(filepage);
1499        SetPageUptodate(filepage);
1500        if (sgp == SGP_DIRTY)
1501            set_page_dirty(filepage);
1502    }
1503done:
1504    *pagep = filepage;
1505    error = 0;
1506    goto out;
1507
1508nospace:
1509    /*
1510     * Perhaps the page was brought in from swap between find_lock_page
1511     * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512     * but must also avoid reporting a spurious ENOSPC while working on a
1513     * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514     * is already in page cache, which prevents this race from occurring.)
1515     */
1516    if (!filepage) {
1517        struct page *page = find_get_page(mapping, idx);
1518        if (page) {
1519            spin_unlock(&info->lock);
1520            page_cache_release(page);
1521            goto repeat;
1522        }
1523    }
1524    spin_unlock(&info->lock);
1525    error = -ENOSPC;
1526failed:
1527    if (*pagep != filepage) {
1528        unlock_page(filepage);
1529        page_cache_release(filepage);
1530    }
1531out:
1532    if (prealloc_page) {
1533        mem_cgroup_uncharge_cache_page(prealloc_page);
1534        page_cache_release(prealloc_page);
1535    }
1536    return error;
1537}
1538
1539static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{
1541    struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542    int error;
1543    int ret;
1544
1545    if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546        return VM_FAULT_SIGBUS;
1547
1548    error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549    if (error)
1550        return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1551    if (ret & VM_FAULT_MAJOR) {
1552        count_vm_event(PGMAJFAULT);
1553        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554    }
1555    return ret | VM_FAULT_LOCKED;
1556}
1557
1558#ifdef CONFIG_NUMA
1559static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1560{
1561    struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1562    return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1563}
1564
1565static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1566                      unsigned long addr)
1567{
1568    struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1569    unsigned long idx;
1570
1571    idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1572    return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
1573}
1574#endif
1575
1576int shmem_lock(struct file *file, int lock, struct user_struct *user)
1577{
1578    struct inode *inode = file->f_path.dentry->d_inode;
1579    struct shmem_inode_info *info = SHMEM_I(inode);
1580    int retval = -ENOMEM;
1581
1582    spin_lock(&info->lock);
1583    if (lock && !(info->flags & VM_LOCKED)) {
1584        if (!user_shm_lock(inode->i_size, user))
1585            goto out_nomem;
1586        info->flags |= VM_LOCKED;
1587        mapping_set_unevictable(file->f_mapping);
1588    }
1589    if (!lock && (info->flags & VM_LOCKED) && user) {
1590        user_shm_unlock(inode->i_size, user);
1591        info->flags &= ~VM_LOCKED;
1592        mapping_clear_unevictable(file->f_mapping);
1593        scan_mapping_unevictable_pages(file->f_mapping);
1594    }
1595    retval = 0;
1596
1597out_nomem:
1598    spin_unlock(&info->lock);
1599    return retval;
1600}
1601
1602static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1603{
1604    file_accessed(file);
1605    vma->vm_ops = &shmem_vm_ops;
1606    vma->vm_flags |= VM_CAN_NONLINEAR;
1607    return 0;
1608}
1609
1610static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
1611                     int mode, dev_t dev, unsigned long flags)
1612{
1613    struct inode *inode;
1614    struct shmem_inode_info *info;
1615    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1616
1617    if (shmem_reserve_inode(sb))
1618        return NULL;
1619
1620    inode = new_inode(sb);
1621    if (inode) {
1622        inode->i_ino = get_next_ino();
1623        inode_init_owner(inode, dir, mode);
1624        inode->i_blocks = 0;
1625        inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1626        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1627        inode->i_generation = get_seconds();
1628        info = SHMEM_I(inode);
1629        memset(info, 0, (char *)inode - (char *)info);
1630        spin_lock_init(&info->lock);
1631        info->flags = flags & VM_NORESERVE;
1632        INIT_LIST_HEAD(&info->swaplist);
1633        INIT_LIST_HEAD(&info->xattr_list);
1634        cache_no_acl(inode);
1635
1636        switch (mode & S_IFMT) {
1637        default:
1638            inode->i_op = &shmem_special_inode_operations;
1639            init_special_inode(inode, mode, dev);
1640            break;
1641        case S_IFREG:
1642            inode->i_mapping->a_ops = &shmem_aops;
1643            inode->i_op = &shmem_inode_operations;
1644            inode->i_fop = &shmem_file_operations;
1645            mpol_shared_policy_init(&info->policy,
1646                         shmem_get_sbmpol(sbinfo));
1647            break;
1648        case S_IFDIR:
1649            inc_nlink(inode);
1650            /* Some things misbehave if size == 0 on a directory */
1651            inode->i_size = 2 * BOGO_DIRENT_SIZE;
1652            inode->i_op = &shmem_dir_inode_operations;
1653            inode->i_fop = &simple_dir_operations;
1654            break;
1655        case S_IFLNK:
1656            /*
1657             * Must not load anything in the rbtree,
1658             * mpol_free_shared_policy will not be called.
1659             */
1660            mpol_shared_policy_init(&info->policy, NULL);
1661            break;
1662        }
1663    } else
1664        shmem_free_inode(sb);
1665    return inode;
1666}
1667
1668#ifdef CONFIG_TMPFS
1669static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations;
1671
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679    struct inode *inode = page->mapping->host;
1680    int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681    unlock_page(page);
1682    return error;
1683}
1684
1685static int
1686shmem_write_begin(struct file *file, struct address_space *mapping,
1687            loff_t pos, unsigned len, unsigned flags,
1688            struct page **pagep, void **fsdata)
1689{
1690    struct inode *inode = mapping->host;
1691    pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692    *pagep = NULL;
1693    return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694}
1695
1696static int
1697shmem_write_end(struct file *file, struct address_space *mapping,
1698            loff_t pos, unsigned len, unsigned copied,
1699            struct page *page, void *fsdata)
1700{
1701    struct inode *inode = mapping->host;
1702
1703    if (pos + copied > inode->i_size)
1704        i_size_write(inode, pos + copied);
1705
1706    set_page_dirty(page);
1707    unlock_page(page);
1708    page_cache_release(page);
1709
1710    return copied;
1711}
1712
1713static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1714{
1715    struct inode *inode = filp->f_path.dentry->d_inode;
1716    struct address_space *mapping = inode->i_mapping;
1717    unsigned long index, offset;
1718    enum sgp_type sgp = SGP_READ;
1719
1720    /*
1721     * Might this read be for a stacking filesystem? Then when reading
1722     * holes of a sparse file, we actually need to allocate those pages,
1723     * and even mark them dirty, so it cannot exceed the max_blocks limit.
1724     */
1725    if (segment_eq(get_fs(), KERNEL_DS))
1726        sgp = SGP_DIRTY;
1727
1728    index = *ppos >> PAGE_CACHE_SHIFT;
1729    offset = *ppos & ~PAGE_CACHE_MASK;
1730
1731    for (;;) {
1732        struct page *page = NULL;
1733        unsigned long end_index, nr, ret;
1734        loff_t i_size = i_size_read(inode);
1735
1736        end_index = i_size >> PAGE_CACHE_SHIFT;
1737        if (index > end_index)
1738            break;
1739        if (index == end_index) {
1740            nr = i_size & ~PAGE_CACHE_MASK;
1741            if (nr <= offset)
1742                break;
1743        }
1744
1745        desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1746        if (desc->error) {
1747            if (desc->error == -EINVAL)
1748                desc->error = 0;
1749            break;
1750        }
1751        if (page)
1752            unlock_page(page);
1753
1754        /*
1755         * We must evaluate after, since reads (unlike writes)
1756         * are called without i_mutex protection against truncate
1757         */
1758        nr = PAGE_CACHE_SIZE;
1759        i_size = i_size_read(inode);
1760        end_index = i_size >> PAGE_CACHE_SHIFT;
1761        if (index == end_index) {
1762            nr = i_size & ~PAGE_CACHE_MASK;
1763            if (nr <= offset) {
1764                if (page)
1765                    page_cache_release(page);
1766                break;
1767            }
1768        }
1769        nr -= offset;
1770
1771        if (page) {
1772            /*
1773             * If users can be writing to this page using arbitrary
1774             * virtual addresses, take care about potential aliasing
1775             * before reading the page on the kernel side.
1776             */
1777            if (mapping_writably_mapped(mapping))
1778                flush_dcache_page(page);
1779            /*
1780             * Mark the page accessed if we read the beginning.
1781             */
1782            if (!offset)
1783                mark_page_accessed(page);
1784        } else {
1785            page = ZERO_PAGE(0);
1786            page_cache_get(page);
1787        }
1788
1789        /*
1790         * Ok, we have the page, and it's up-to-date, so
1791         * now we can copy it to user space...
1792         *
1793         * The actor routine returns how many bytes were actually used..
1794         * NOTE! This may not be the same as how much of a user buffer
1795         * we filled up (we may be padding etc), so we can only update
1796         * "pos" here (the actor routine has to update the user buffer
1797         * pointers and the remaining count).
1798         */
1799        ret = actor(desc, page, offset, nr);
1800        offset += ret;
1801        index += offset >> PAGE_CACHE_SHIFT;
1802        offset &= ~PAGE_CACHE_MASK;
1803
1804        page_cache_release(page);
1805        if (ret != nr || !desc->count)
1806            break;
1807
1808        cond_resched();
1809    }
1810
1811    *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1812    file_accessed(filp);
1813}
1814
1815static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1816        const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1817{
1818    struct file *filp = iocb->ki_filp;
1819    ssize_t retval;
1820    unsigned long seg;
1821    size_t count;
1822    loff_t *ppos = &iocb->ki_pos;
1823
1824    retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1825    if (retval)
1826        return retval;
1827
1828    for (seg = 0; seg < nr_segs; seg++) {
1829        read_descriptor_t desc;
1830
1831        desc.written = 0;
1832        desc.arg.buf = iov[seg].iov_base;
1833        desc.count = iov[seg].iov_len;
1834        if (desc.count == 0)
1835            continue;
1836        desc.error = 0;
1837        do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1838        retval += desc.written;
1839        if (desc.error) {
1840            retval = retval ?: desc.error;
1841            break;
1842        }
1843        if (desc.count > 0)
1844            break;
1845    }
1846    return retval;
1847}
1848
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{
1851    struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1852
1853    buf->f_type = TMPFS_MAGIC;
1854    buf->f_bsize = PAGE_CACHE_SIZE;
1855    buf->f_namelen = NAME_MAX;
1856    if (sbinfo->max_blocks) {
1857        buf->f_blocks = sbinfo->max_blocks;
1858        buf->f_bavail = buf->f_bfree =
1859                sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1860    }
1861    if (sbinfo->max_inodes) {
1862        buf->f_files = sbinfo->max_inodes;
1863        buf->f_ffree = sbinfo->free_inodes;
1864    }
1865    /* else leave those fields 0 like simple_statfs */
1866    return 0;
1867}
1868
1869/*
1870 * File creation. Allocate an inode, and we're done..
1871 */
1872static int
1873shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1874{
1875    struct inode *inode;
1876    int error = -ENOSPC;
1877
1878    inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1879    if (inode) {
1880        error = security_inode_init_security(inode, dir,
1881                             &dentry->d_name, NULL,
1882                             NULL, NULL);
1883        if (error) {
1884            if (error != -EOPNOTSUPP) {
1885                iput(inode);
1886                return error;
1887            }
1888        }
1889#ifdef CONFIG_TMPFS_POSIX_ACL
1890        error = generic_acl_init(inode, dir);
1891        if (error) {
1892            iput(inode);
1893            return error;
1894        }
1895#else
1896        error = 0;
1897#endif
1898        dir->i_size += BOGO_DIRENT_SIZE;
1899        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1900        d_instantiate(dentry, inode);
1901        dget(dentry); /* Extra count - pin the dentry in core */
1902    }
1903    return error;
1904}
1905
1906static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1907{
1908    int error;
1909
1910    if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1911        return error;
1912    inc_nlink(dir);
1913    return 0;
1914}
1915
1916static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1917        struct nameidata *nd)
1918{
1919    return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1920}
1921
1922/*
1923 * Link a file..
1924 */
1925static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1926{
1927    struct inode *inode = old_dentry->d_inode;
1928    int ret;
1929
1930    /*
1931     * No ordinary (disk based) filesystem counts links as inodes;
1932     * but each new link needs a new dentry, pinning lowmem, and
1933     * tmpfs dentries cannot be pruned until they are unlinked.
1934     */
1935    ret = shmem_reserve_inode(inode->i_sb);
1936    if (ret)
1937        goto out;
1938
1939    dir->i_size += BOGO_DIRENT_SIZE;
1940    inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1941    inc_nlink(inode);
1942    ihold(inode); /* New dentry reference */
1943    dget(dentry); /* Extra pinning count for the created dentry */
1944    d_instantiate(dentry, inode);
1945out:
1946    return ret;
1947}
1948
1949static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1950{
1951    struct inode *inode = dentry->d_inode;
1952
1953    if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1954        shmem_free_inode(inode->i_sb);
1955
1956    dir->i_size -= BOGO_DIRENT_SIZE;
1957    inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1958    drop_nlink(inode);
1959    dput(dentry); /* Undo the count from "create" - this does all the work */
1960    return 0;
1961}
1962
1963static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1964{
1965    if (!simple_empty(dentry))
1966        return -ENOTEMPTY;
1967
1968    drop_nlink(dentry->d_inode);
1969    drop_nlink(dir);
1970    return shmem_unlink(dir, dentry);
1971}
1972
1973/*
1974 * The VFS layer already does all the dentry stuff for rename,
1975 * we just have to decrement the usage count for the target if
1976 * it exists so that the VFS layer correctly free's it when it
1977 * gets overwritten.
1978 */
1979static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1980{
1981    struct inode *inode = old_dentry->d_inode;
1982    int they_are_dirs = S_ISDIR(inode->i_mode);
1983
1984    if (!simple_empty(new_dentry))
1985        return -ENOTEMPTY;
1986
1987    if (new_dentry->d_inode) {
1988        (void) shmem_unlink(new_dir, new_dentry);
1989        if (they_are_dirs)
1990            drop_nlink(old_dir);
1991    } else if (they_are_dirs) {
1992        drop_nlink(old_dir);
1993        inc_nlink(new_dir);
1994    }
1995
1996    old_dir->i_size -= BOGO_DIRENT_SIZE;
1997    new_dir->i_size += BOGO_DIRENT_SIZE;
1998    old_dir->i_ctime = old_dir->i_mtime =
1999    new_dir->i_ctime = new_dir->i_mtime =
2000    inode->i_ctime = CURRENT_TIME;
2001    return 0;
2002}
2003
2004static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
2005{
2006    int error;
2007    int len;
2008    struct inode *inode;
2009    struct page *page = NULL;
2010    char *kaddr;
2011    struct shmem_inode_info *info;
2012
2013    len = strlen(symname) + 1;
2014    if (len > PAGE_CACHE_SIZE)
2015        return -ENAMETOOLONG;
2016
2017    inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
2018    if (!inode)
2019        return -ENOSPC;
2020
2021    error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
2022                         NULL, NULL);
2023    if (error) {
2024        if (error != -EOPNOTSUPP) {
2025            iput(inode);
2026            return error;
2027        }
2028        error = 0;
2029    }
2030
2031    info = SHMEM_I(inode);
2032    inode->i_size = len-1;
2033    if (len <= SHMEM_SYMLINK_INLINE_LEN) {
2034        /* do it inline */
2035        memcpy(info->inline_symlink, symname, len);
2036        inode->i_op = &shmem_symlink_inline_operations;
2037    } else {
2038        error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2039        if (error) {
2040            iput(inode);
2041            return error;
2042        }
2043        inode->i_mapping->a_ops = &shmem_aops;
2044        inode->i_op = &shmem_symlink_inode_operations;
2045        kaddr = kmap_atomic(page, KM_USER0);
2046        memcpy(kaddr, symname, len);
2047        kunmap_atomic(kaddr, KM_USER0);
2048        set_page_dirty(page);
2049        unlock_page(page);
2050        page_cache_release(page);
2051    }
2052    dir->i_size += BOGO_DIRENT_SIZE;
2053    dir->i_ctime = dir->i_mtime = CURRENT_TIME;
2054    d_instantiate(dentry, inode);
2055    dget(dentry);
2056    return 0;
2057}
2058
2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
2060{
2061    nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
2062    return NULL;
2063}
2064
2065static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2066{
2067    struct page *page = NULL;
2068    int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2069    nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
2070    if (page)
2071        unlock_page(page);
2072    return page;
2073}
2074
2075static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2076{
2077    if (!IS_ERR(nd_get_link(nd))) {
2078        struct page *page = cookie;
2079        kunmap(page);
2080        mark_page_accessed(page);
2081        page_cache_release(page);
2082    }
2083}
2084
2085#ifdef CONFIG_TMPFS_XATTR
2086/*
2087 * Superblocks without xattr inode operations may get some security.* xattr
2088 * support from the LSM "for free". As soon as we have any other xattrs
2089 * like ACLs, we also need to implement the security.* handlers at
2090 * filesystem level, though.
2091 */
2092
2093static int shmem_xattr_get(struct dentry *dentry, const char *name,
2094               void *buffer, size_t size)
2095{
2096    struct shmem_inode_info *info;
2097    struct shmem_xattr *xattr;
2098    int ret = -ENODATA;
2099
2100    info = SHMEM_I(dentry->d_inode);
2101
2102    spin_lock(&info->lock);
2103    list_for_each_entry(xattr, &info->xattr_list, list) {
2104        if (strcmp(name, xattr->name))
2105            continue;
2106
2107        ret = xattr->size;
2108        if (buffer) {
2109            if (size < xattr->size)
2110                ret = -ERANGE;
2111            else
2112                memcpy(buffer, xattr->value, xattr->size);
2113        }
2114        break;
2115    }
2116    spin_unlock(&info->lock);
2117    return ret;
2118}
2119
2120static int shmem_xattr_set(struct dentry *dentry, const char *name,
2121               const void *value, size_t size, int flags)
2122{
2123    struct inode *inode = dentry->d_inode;
2124    struct shmem_inode_info *info = SHMEM_I(inode);
2125    struct shmem_xattr *xattr;
2126    struct shmem_xattr *new_xattr = NULL;
2127    size_t len;
2128    int err = 0;
2129
2130    /* value == NULL means remove */
2131    if (value) {
2132        /* wrap around? */
2133        len = sizeof(*new_xattr) + size;
2134        if (len <= sizeof(*new_xattr))
2135            return -ENOMEM;
2136
2137        new_xattr = kmalloc(len, GFP_KERNEL);
2138        if (!new_xattr)
2139            return -ENOMEM;
2140
2141        new_xattr->name = kstrdup(name, GFP_KERNEL);
2142        if (!new_xattr->name) {
2143            kfree(new_xattr);
2144            return -ENOMEM;
2145        }
2146
2147        new_xattr->size = size;
2148        memcpy(new_xattr->value, value, size);
2149    }
2150
2151    spin_lock(&info->lock);
2152    list_for_each_entry(xattr, &info->xattr_list, list) {
2153        if (!strcmp(name, xattr->name)) {
2154            if (flags & XATTR_CREATE) {
2155                xattr = new_xattr;
2156                err = -EEXIST;
2157            } else if (new_xattr) {
2158                list_replace(&xattr->list, &new_xattr->list);
2159            } else {
2160                list_del(&xattr->list);
2161            }
2162            goto out;
2163        }
2164    }
2165    if (flags & XATTR_REPLACE) {
2166        xattr = new_xattr;
2167        err = -ENODATA;
2168    } else {
2169        list_add(&new_xattr->list, &info->xattr_list);
2170        xattr = NULL;
2171    }
2172out:
2173    spin_unlock(&info->lock);
2174    if (xattr)
2175        kfree(xattr->name);
2176    kfree(xattr);
2177    return err;
2178}
2179
2180
2181static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL
2183    &generic_acl_access_handler,
2184    &generic_acl_default_handler,
2185#endif
2186    NULL
2187};
2188
2189static int shmem_xattr_validate(const char *name)
2190{
2191    struct { const char *prefix; size_t len; } arr[] = {
2192        { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2193        { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2194    };
2195    int i;
2196
2197    for (i = 0; i < ARRAY_SIZE(arr); i++) {
2198        size_t preflen = arr[i].len;
2199        if (strncmp(name, arr[i].prefix, preflen) == 0) {
2200            if (!name[preflen])
2201                return -EINVAL;
2202            return 0;
2203        }
2204    }
2205    return -EOPNOTSUPP;
2206}
2207
2208static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2209                  void *buffer, size_t size)
2210{
2211    int err;
2212
2213    /*
2214     * If this is a request for a synthetic attribute in the system.*
2215     * namespace use the generic infrastructure to resolve a handler
2216     * for it via sb->s_xattr.
2217     */
2218    if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2219        return generic_getxattr(dentry, name, buffer, size);
2220
2221    err = shmem_xattr_validate(name);
2222    if (err)
2223        return err;
2224
2225    return shmem_xattr_get(dentry, name, buffer, size);
2226}
2227
2228static int shmem_setxattr(struct dentry *dentry, const char *name,
2229              const void *value, size_t size, int flags)
2230{
2231    int err;
2232
2233    /*
2234     * If this is a request for a synthetic attribute in the system.*
2235     * namespace use the generic infrastructure to resolve a handler
2236     * for it via sb->s_xattr.
2237     */
2238    if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2239        return generic_setxattr(dentry, name, value, size, flags);
2240
2241    err = shmem_xattr_validate(name);
2242    if (err)
2243        return err;
2244
2245    if (size == 0)
2246        value = ""; /* empty EA, do not remove */
2247
2248    return shmem_xattr_set(dentry, name, value, size, flags);
2249
2250}
2251
2252static int shmem_removexattr(struct dentry *dentry, const char *name)
2253{
2254    int err;
2255
2256    /*
2257     * If this is a request for a synthetic attribute in the system.*
2258     * namespace use the generic infrastructure to resolve a handler
2259     * for it via sb->s_xattr.
2260     */
2261    if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2262        return generic_removexattr(dentry, name);
2263
2264    err = shmem_xattr_validate(name);
2265    if (err)
2266        return err;
2267
2268    return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
2269}
2270
2271static bool xattr_is_trusted(const char *name)
2272{
2273    return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2274}
2275
2276static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2277{
2278    bool trusted = capable(CAP_SYS_ADMIN);
2279    struct shmem_xattr *xattr;
2280    struct shmem_inode_info *info;
2281    size_t used = 0;
2282
2283    info = SHMEM_I(dentry->d_inode);
2284
2285    spin_lock(&info->lock);
2286    list_for_each_entry(xattr, &info->xattr_list, list) {
2287        size_t len;
2288
2289        /* skip "trusted." attributes for unprivileged callers */
2290        if (!trusted && xattr_is_trusted(xattr->name))
2291            continue;
2292
2293        len = strlen(xattr->name) + 1;
2294        used += len;
2295        if (buffer) {
2296            if (size < used) {
2297                used = -ERANGE;
2298                break;
2299            }
2300            memcpy(buffer, xattr->name, len);
2301            buffer += len;
2302        }
2303    }
2304    spin_unlock(&info->lock);
2305
2306    return used;
2307}
2308#endif /* CONFIG_TMPFS_XATTR */
2309
2310static const struct inode_operations shmem_symlink_inline_operations = {
2311    .readlink = generic_readlink,
2312    .follow_link = shmem_follow_link_inline,
2313#ifdef CONFIG_TMPFS_XATTR
2314    .setxattr = shmem_setxattr,
2315    .getxattr = shmem_getxattr,
2316    .listxattr = shmem_listxattr,
2317    .removexattr = shmem_removexattr,
2318#endif
2319};
2320
2321static const struct inode_operations shmem_symlink_inode_operations = {
2322    .readlink = generic_readlink,
2323    .follow_link = shmem_follow_link,
2324    .put_link = shmem_put_link,
2325#ifdef CONFIG_TMPFS_XATTR
2326    .setxattr = shmem_setxattr,
2327    .getxattr = shmem_getxattr,
2328    .listxattr = shmem_listxattr,
2329    .removexattr = shmem_removexattr,
2330#endif
2331};
2332
2333static struct dentry *shmem_get_parent(struct dentry *child)
2334{
2335    return ERR_PTR(-ESTALE);
2336}
2337
2338static int shmem_match(struct inode *ino, void *vfh)
2339{
2340    __u32 *fh = vfh;
2341    __u64 inum = fh[2];
2342    inum = (inum << 32) | fh[1];
2343    return ino->i_ino == inum && fh[0] == ino->i_generation;
2344}
2345
2346static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2347        struct fid *fid, int fh_len, int fh_type)
2348{
2349    struct inode *inode;
2350    struct dentry *dentry = NULL;
2351    u64 inum = fid->raw[2];
2352    inum = (inum << 32) | fid->raw[1];
2353
2354    if (fh_len < 3)
2355        return NULL;
2356
2357    inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2358            shmem_match, fid->raw);
2359    if (inode) {
2360        dentry = d_find_alias(inode);
2361        iput(inode);
2362    }
2363
2364    return dentry;
2365}
2366
2367static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2368                int connectable)
2369{
2370    struct inode *inode = dentry->d_inode;
2371
2372    if (*len < 3) {
2373        *len = 3;
2374        return 255;
2375    }
2376
2377    if (inode_unhashed(inode)) {
2378        /* Unfortunately insert_inode_hash is not idempotent,
2379         * so as we hash inodes here rather than at creation
2380         * time, we need a lock to ensure we only try
2381         * to do it once
2382         */
2383        static DEFINE_SPINLOCK(lock);
2384        spin_lock(&lock);
2385        if (inode_unhashed(inode))
2386            __insert_inode_hash(inode,
2387                        inode->i_ino + inode->i_generation);
2388        spin_unlock(&lock);
2389    }
2390
2391    fh[0] = inode->i_generation;
2392    fh[1] = inode->i_ino;
2393    fh[2] = ((__u64)inode->i_ino) >> 32;
2394
2395    *len = 3;
2396    return 1;
2397}
2398
2399static const struct export_operations shmem_export_ops = {
2400    .get_parent = shmem_get_parent,
2401    .encode_fh = shmem_encode_fh,
2402    .fh_to_dentry = shmem_fh_to_dentry,
2403};
2404
2405static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2406                   bool remount)
2407{
2408    char *this_char, *value, *rest;
2409
2410    while (options != NULL) {
2411        this_char = options;
2412        for (;;) {
2413            /*
2414             * NUL-terminate this option: unfortunately,
2415             * mount options form a comma-separated list,
2416             * but mpol's nodelist may also contain commas.
2417             */
2418            options = strchr(options, ',');
2419            if (options == NULL)
2420                break;
2421            options++;
2422            if (!isdigit(*options)) {
2423                options[-1] = '\0';
2424                break;
2425            }
2426        }
2427        if (!*this_char)
2428            continue;
2429        if ((value = strchr(this_char,'=')) != NULL) {
2430            *value++ = 0;
2431        } else {
2432            printk(KERN_ERR
2433                "tmpfs: No value for mount option '%s'\n",
2434                this_char);
2435            return 1;
2436        }
2437
2438        if (!strcmp(this_char,"size")) {
2439            unsigned long long size;
2440            size = memparse(value,&rest);
2441            if (*rest == '%') {
2442                size <<= PAGE_SHIFT;
2443                size *= totalram_pages;
2444                do_div(size, 100);
2445                rest++;
2446            }
2447            if (*rest)
2448                goto bad_val;
2449            sbinfo->max_blocks =
2450                DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2451        } else if (!strcmp(this_char,"nr_blocks")) {
2452            sbinfo->max_blocks = memparse(value, &rest);
2453            if (*rest)
2454                goto bad_val;
2455        } else if (!strcmp(this_char,"nr_inodes")) {
2456            sbinfo->max_inodes = memparse(value, &rest);
2457            if (*rest)
2458                goto bad_val;
2459        } else if (!strcmp(this_char,"mode")) {
2460            if (remount)
2461                continue;
2462            sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
2463            if (*rest)
2464                goto bad_val;
2465        } else if (!strcmp(this_char,"uid")) {
2466            if (remount)
2467                continue;
2468            sbinfo->uid = simple_strtoul(value, &rest, 0);
2469            if (*rest)
2470                goto bad_val;
2471        } else if (!strcmp(this_char,"gid")) {
2472            if (remount)
2473                continue;
2474            sbinfo->gid = simple_strtoul(value, &rest, 0);
2475            if (*rest)
2476                goto bad_val;
2477        } else if (!strcmp(this_char,"mpol")) {
2478            if (mpol_parse_str(value, &sbinfo->mpol, 1))
2479                goto bad_val;
2480        } else {
2481            printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2482                   this_char);
2483            return 1;
2484        }
2485    }
2486    return 0;
2487
2488bad_val:
2489    printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2490           value, this_char);
2491    return 1;
2492
2493}
2494
2495static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2496{
2497    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2498    struct shmem_sb_info config = *sbinfo;
2499    unsigned long inodes;
2500    int error = -EINVAL;
2501
2502    if (shmem_parse_options(data, &config, true))
2503        return error;
2504
2505    spin_lock(&sbinfo->stat_lock);
2506    inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2507    if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2508        goto out;
2509    if (config.max_inodes < inodes)
2510        goto out;
2511    /*
2512     * Those tests also disallow limited->unlimited while any are in
2513     * use, so i_blocks will always be zero when max_blocks is zero;
2514     * but we must separately disallow unlimited->limited, because
2515     * in that case we have no record of how much is already in use.
2516     */
2517    if (config.max_blocks && !sbinfo->max_blocks)
2518        goto out;
2519    if (config.max_inodes && !sbinfo->max_inodes)
2520        goto out;
2521
2522    error = 0;
2523    sbinfo->max_blocks = config.max_blocks;
2524    sbinfo->max_inodes = config.max_inodes;
2525    sbinfo->free_inodes = config.max_inodes - inodes;
2526
2527    mpol_put(sbinfo->mpol);
2528    sbinfo->mpol = config.mpol; /* transfers initial ref */
2529out:
2530    spin_unlock(&sbinfo->stat_lock);
2531    return error;
2532}
2533
2534static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2535{
2536    struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
2537
2538    if (sbinfo->max_blocks != shmem_default_max_blocks())
2539        seq_printf(seq, ",size=%luk",
2540            sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
2541    if (sbinfo->max_inodes != shmem_default_max_inodes())
2542        seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2543    if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2544        seq_printf(seq, ",mode=%03o", sbinfo->mode);
2545    if (sbinfo->uid != 0)
2546        seq_printf(seq, ",uid=%u", sbinfo->uid);
2547    if (sbinfo->gid != 0)
2548        seq_printf(seq, ",gid=%u", sbinfo->gid);
2549    shmem_show_mpol(seq, sbinfo->mpol);
2550    return 0;
2551}
2552#endif /* CONFIG_TMPFS */
2553
2554static void shmem_put_super(struct super_block *sb)
2555{
2556    struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2557
2558    percpu_counter_destroy(&sbinfo->used_blocks);
2559    kfree(sbinfo);
2560    sb->s_fs_info = NULL;
2561}
2562
2563int shmem_fill_super(struct super_block *sb, void *data, int silent)
2564{
2565    struct inode *inode;
2566    struct dentry *root;
2567    struct shmem_sb_info *sbinfo;
2568    int err = -ENOMEM;
2569
2570    /* Round up to L1_CACHE_BYTES to resist false sharing */
2571    sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
2572                L1_CACHE_BYTES), GFP_KERNEL);
2573    if (!sbinfo)
2574        return -ENOMEM;
2575
2576    sbinfo->mode = S_IRWXUGO | S_ISVTX;
2577    sbinfo->uid = current_fsuid();
2578    sbinfo->gid = current_fsgid();
2579    sb->s_fs_info = sbinfo;
2580
2581#ifdef CONFIG_TMPFS
2582    /*
2583     * Per default we only allow half of the physical ram per
2584     * tmpfs instance, limiting inodes to one per page of lowmem;
2585     * but the internal instance is left unlimited.
2586     */
2587    if (!(sb->s_flags & MS_NOUSER)) {
2588        sbinfo->max_blocks = shmem_default_max_blocks();
2589        sbinfo->max_inodes = shmem_default_max_inodes();
2590        if (shmem_parse_options(data, sbinfo, false)) {
2591            err = -EINVAL;
2592            goto failed;
2593        }
2594    }
2595    sb->s_export_op = &shmem_export_ops;
2596#else
2597    sb->s_flags |= MS_NOUSER;
2598#endif
2599
2600    spin_lock_init(&sbinfo->stat_lock);
2601    if (percpu_counter_init(&sbinfo->used_blocks, 0))
2602        goto failed;
2603    sbinfo->free_inodes = sbinfo->max_inodes;
2604
2605    sb->s_maxbytes = SHMEM_MAX_BYTES;
2606    sb->s_blocksize = PAGE_CACHE_SIZE;
2607    sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2608    sb->s_magic = TMPFS_MAGIC;
2609    sb->s_op = &shmem_ops;
2610    sb->s_time_gran = 1;
2611#ifdef CONFIG_TMPFS_XATTR
2612    sb->s_xattr = shmem_xattr_handlers;
2613#endif
2614#ifdef CONFIG_TMPFS_POSIX_ACL
2615    sb->s_flags |= MS_POSIXACL;
2616#endif
2617
2618    inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
2619    if (!inode)
2620        goto failed;
2621    inode->i_uid = sbinfo->uid;
2622    inode->i_gid = sbinfo->gid;
2623    root = d_alloc_root(inode);
2624    if (!root)
2625        goto failed_iput;
2626    sb->s_root = root;
2627    return 0;
2628
2629failed_iput:
2630    iput(inode);
2631failed:
2632    shmem_put_super(sb);
2633    return err;
2634}
2635
2636static struct kmem_cache *shmem_inode_cachep;
2637
2638static struct inode *shmem_alloc_inode(struct super_block *sb)
2639{
2640    struct shmem_inode_info *p;
2641    p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2642    if (!p)
2643        return NULL;
2644    return &p->vfs_inode;
2645}
2646
2647static void shmem_i_callback(struct rcu_head *head)
2648{
2649    struct inode *inode = container_of(head, struct inode, i_rcu);
2650    INIT_LIST_HEAD(&inode->i_dentry);
2651    kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2652}
2653
2654static void shmem_destroy_inode(struct inode *inode)
2655{
2656    if ((inode->i_mode & S_IFMT) == S_IFREG) {
2657        /* only struct inode is valid if it's an inline symlink */
2658        mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2659    }
2660    call_rcu(&inode->i_rcu, shmem_i_callback);
2661}
2662
2663static void init_once(void *foo)
2664{
2665    struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2666
2667    inode_init_once(&p->vfs_inode);
2668}
2669
2670static int init_inodecache(void)
2671{
2672    shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2673                sizeof(struct shmem_inode_info),
2674                0, SLAB_PANIC, init_once);
2675    return 0;
2676}
2677
2678static void destroy_inodecache(void)
2679{
2680    kmem_cache_destroy(shmem_inode_cachep);
2681}
2682
2683static const struct address_space_operations shmem_aops = {
2684    .writepage = shmem_writepage,
2685    .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS
2687    .readpage = shmem_readpage,
2688    .write_begin = shmem_write_begin,
2689    .write_end = shmem_write_end,
2690#endif
2691    .migratepage = migrate_page,
2692    .error_remove_page = generic_error_remove_page,
2693};
2694
2695static const struct file_operations shmem_file_operations = {
2696    .mmap = shmem_mmap,
2697#ifdef CONFIG_TMPFS
2698    .llseek = generic_file_llseek,
2699    .read = do_sync_read,
2700    .write = do_sync_write,
2701    .aio_read = shmem_file_aio_read,
2702    .aio_write = generic_file_aio_write,
2703    .fsync = noop_fsync,
2704    .splice_read = generic_file_splice_read,
2705    .splice_write = generic_file_splice_write,
2706#endif
2707};
2708
2709static const struct inode_operations shmem_inode_operations = {
2710    .setattr = shmem_setattr,
2711    .truncate_range = shmem_truncate_range,
2712#ifdef CONFIG_TMPFS_XATTR
2713    .setxattr = shmem_setxattr,
2714    .getxattr = shmem_getxattr,
2715    .listxattr = shmem_listxattr,
2716    .removexattr = shmem_removexattr,
2717#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719    .check_acl = generic_check_acl,
2720#endif
2721
2722};
2723
2724static const struct inode_operations shmem_dir_inode_operations = {
2725#ifdef CONFIG_TMPFS
2726    .create = shmem_create,
2727    .lookup = simple_lookup,
2728    .link = shmem_link,
2729    .unlink = shmem_unlink,
2730    .symlink = shmem_symlink,
2731    .mkdir = shmem_mkdir,
2732    .rmdir = shmem_rmdir,
2733    .mknod = shmem_mknod,
2734    .rename = shmem_rename,
2735#endif
2736#ifdef CONFIG_TMPFS_XATTR
2737    .setxattr = shmem_setxattr,
2738    .getxattr = shmem_getxattr,
2739    .listxattr = shmem_listxattr,
2740    .removexattr = shmem_removexattr,
2741#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL
2743    .setattr = shmem_setattr,
2744    .check_acl = generic_check_acl,
2745#endif
2746};
2747
2748static const struct inode_operations shmem_special_inode_operations = {
2749#ifdef CONFIG_TMPFS_XATTR
2750    .setxattr = shmem_setxattr,
2751    .getxattr = shmem_getxattr,
2752    .listxattr = shmem_listxattr,
2753    .removexattr = shmem_removexattr,
2754#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL
2756    .setattr = shmem_setattr,
2757    .check_acl = generic_check_acl,
2758#endif
2759};
2760
2761static const struct super_operations shmem_ops = {
2762    .alloc_inode = shmem_alloc_inode,
2763    .destroy_inode = shmem_destroy_inode,
2764#ifdef CONFIG_TMPFS
2765    .statfs = shmem_statfs,
2766    .remount_fs = shmem_remount_fs,
2767    .show_options = shmem_show_options,
2768#endif
2769    .evict_inode = shmem_evict_inode,
2770    .drop_inode = generic_delete_inode,
2771    .put_super = shmem_put_super,
2772};
2773
2774static const struct vm_operations_struct shmem_vm_ops = {
2775    .fault = shmem_fault,
2776#ifdef CONFIG_NUMA
2777    .set_policy = shmem_set_policy,
2778    .get_policy = shmem_get_policy,
2779#endif
2780};
2781
2782
2783static struct dentry *shmem_mount(struct file_system_type *fs_type,
2784    int flags, const char *dev_name, void *data)
2785{
2786    return mount_nodev(fs_type, flags, data, shmem_fill_super);
2787}
2788
2789static struct file_system_type tmpfs_fs_type = {
2790    .owner = THIS_MODULE,
2791    .name = "tmpfs",
2792    .mount = shmem_mount,
2793    .kill_sb = kill_litter_super,
2794};
2795
2796int __init init_tmpfs(void)
2797{
2798    int error;
2799
2800    error = bdi_init(&shmem_backing_dev_info);
2801    if (error)
2802        goto out4;
2803
2804    error = init_inodecache();
2805    if (error)
2806        goto out3;
2807
2808    error = register_filesystem(&tmpfs_fs_type);
2809    if (error) {
2810        printk(KERN_ERR "Could not register tmpfs\n");
2811        goto out2;
2812    }
2813
2814    shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
2815                tmpfs_fs_type.name, NULL);
2816    if (IS_ERR(shm_mnt)) {
2817        error = PTR_ERR(shm_mnt);
2818        printk(KERN_ERR "Could not kern_mount tmpfs\n");
2819        goto out1;
2820    }
2821    return 0;
2822
2823out1:
2824    unregister_filesystem(&tmpfs_fs_type);
2825out2:
2826    destroy_inodecache();
2827out3:
2828    bdi_destroy(&shmem_backing_dev_info);
2829out4:
2830    shm_mnt = ERR_PTR(error);
2831    return error;
2832}
2833
2834#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2835/**
2836 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2837 * @inode: the inode to be searched
2838 * @pgoff: the offset to be searched
2839 * @pagep: the pointer for the found page to be stored
2840 * @ent: the pointer for the found swap entry to be stored
2841 *
2842 * If a page is found, refcount of it is incremented. Callers should handle
2843 * these refcount.
2844 */
2845void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2846                    struct page **pagep, swp_entry_t *ent)
2847{
2848    swp_entry_t entry = { .val = 0 }, *ptr;
2849    struct page *page = NULL;
2850    struct shmem_inode_info *info = SHMEM_I(inode);
2851
2852    if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2853        goto out;
2854
2855    spin_lock(&info->lock);
2856    ptr = shmem_swp_entry(info, pgoff, NULL);
2857#ifdef CONFIG_SWAP
2858    if (ptr && ptr->val) {
2859        entry.val = ptr->val;
2860        page = find_get_page(&swapper_space, entry.val);
2861    } else
2862#endif
2863        page = find_get_page(inode->i_mapping, pgoff);
2864    if (ptr)
2865        shmem_swp_unmap(ptr);
2866    spin_unlock(&info->lock);
2867out:
2868    *pagep = page;
2869    *ent = entry;
2870}
2871#endif
2872
2873#else /* !CONFIG_SHMEM */
2874
2875/*
2876 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2877 *
2878 * This is intended for small system where the benefits of the full
2879 * shmem code (swap-backed and resource-limited) are outweighed by
2880 * their complexity. On systems without swap this code should be
2881 * effectively equivalent, but much lighter weight.
2882 */
2883
2884#include <linux/ramfs.h>
2885
2886static struct file_system_type tmpfs_fs_type = {
2887    .name = "tmpfs",
2888    .mount = ramfs_mount,
2889    .kill_sb = kill_litter_super,
2890};
2891
2892int __init init_tmpfs(void)
2893{
2894    BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2895
2896    shm_mnt = kern_mount(&tmpfs_fs_type);
2897    BUG_ON(IS_ERR(shm_mnt));
2898
2899    return 0;
2900}
2901
2902int shmem_unuse(swp_entry_t entry, struct page *page)
2903{
2904    return 0;
2905}
2906
2907int shmem_lock(struct file *file, int lock, struct user_struct *user)
2908{
2909    return 0;
2910}
2911
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2913{
2914    truncate_inode_pages_range(inode->i_mapping, start, end);
2915}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917
2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2919/**
2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2921 * @inode: the inode to be searched
2922 * @pgoff: the offset to be searched
2923 * @pagep: the pointer for the found page to be stored
2924 * @ent: the pointer for the found swap entry to be stored
2925 *
2926 * If a page is found, refcount of it is incremented. Callers should handle
2927 * these refcount.
2928 */
2929void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2930                    struct page **pagep, swp_entry_t *ent)
2931{
2932    struct page *page = NULL;
2933
2934    if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2935        goto out;
2936    page = find_get_page(inode->i_mapping, pgoff);
2937out:
2938    *pagep = page;
2939    *ent = (swp_entry_t){ .val = 0 };
2940}
2941#endif
2942
2943#define shmem_vm_ops generic_file_vm_ops
2944#define shmem_file_operations ramfs_file_operations
2945#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2946#define shmem_acct_size(flags, size) 0
2947#define shmem_unacct_size(flags, size) do {} while (0)
2948#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2949
2950#endif /* CONFIG_SHMEM */
2951
2952/* common code */
2953
2954/**
2955 * shmem_file_setup - get an unlinked file living in tmpfs
2956 * @name: name for dentry (to be seen in /proc/<pid>/maps
2957 * @size: size to be set for the file
2958 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
2959 */
2960struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
2961{
2962    int error;
2963    struct file *file;
2964    struct inode *inode;
2965    struct path path;
2966    struct dentry *root;
2967    struct qstr this;
2968
2969    if (IS_ERR(shm_mnt))
2970        return (void *)shm_mnt;
2971
2972    if (size < 0 || size > SHMEM_MAX_BYTES)
2973        return ERR_PTR(-EINVAL);
2974
2975    if (shmem_acct_size(flags, size))
2976        return ERR_PTR(-ENOMEM);
2977
2978    error = -ENOMEM;
2979    this.name = name;
2980    this.len = strlen(name);
2981    this.hash = 0; /* will go */
2982    root = shm_mnt->mnt_root;
2983    path.dentry = d_alloc(root, &this);
2984    if (!path.dentry)
2985        goto put_memory;
2986    path.mnt = mntget(shm_mnt);
2987
2988    error = -ENOSPC;
2989    inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
2990    if (!inode)
2991        goto put_dentry;
2992
2993    d_instantiate(path.dentry, inode);
2994    inode->i_size = size;
2995    inode->i_nlink = 0; /* It is unlinked */
2996#ifndef CONFIG_MMU
2997    error = ramfs_nommu_expand_for_mapping(inode, size);
2998    if (error)
2999        goto put_dentry;
3000#endif
3001
3002    error = -ENFILE;
3003    file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
3004          &shmem_file_operations);
3005    if (!file)
3006        goto put_dentry;
3007
3008    return file;
3009
3010put_dentry:
3011    path_put(&path);
3012put_memory:
3013    shmem_unacct_size(flags, size);
3014    return ERR_PTR(error);
3015}
3016EXPORT_SYMBOL_GPL(shmem_file_setup);
3017
3018/**
3019 * shmem_zero_setup - setup a shared anonymous mapping
3020 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
3021 */
3022int shmem_zero_setup(struct vm_area_struct *vma)
3023{
3024    struct file *file;
3025    loff_t size = vma->vm_end - vma->vm_start;
3026
3027    file = shmem_file_setup("dev/zero", size, vma->vm_flags);
3028    if (IS_ERR(file))
3029        return PTR_ERR(file);
3030
3031    if (vma->vm_file)
3032        fput(vma->vm_file);
3033    vma->vm_file = file;
3034    vma->vm_ops = &shmem_vm_ops;
3035    vma->vm_flags |= VM_CAN_NONLINEAR;
3036    return 0;
3037}
3038
3039/**
3040 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3041 * @mapping: the page's address_space
3042 * @index: the page index
3043 * @gfp: the page allocator flags to use if allocating
3044 *
3045 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3046 * with any new page allocations done using the specified allocation flags.
3047 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 *
3051 * Provide a stub for those callers to start using now, then later
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056                     pgoff_t index, gfp_t gfp)
3057{
3058    return read_cache_page_gfp(mapping, index, gfp);
3059}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
3061

Archive Download this file



interactive