Root/fs/namei.c

1/*
2 * linux/fs/namei.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * Some corrections by tytso.
9 */
10
11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12 * lookup logic.
13 */
14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15 */
16
17#include <linux/init.h>
18#include <linux/module.h>
19#include <linux/slab.h>
20#include <linux/fs.h>
21#include <linux/namei.h>
22#include <linux/pagemap.h>
23#include <linux/fsnotify.h>
24#include <linux/personality.h>
25#include <linux/security.h>
26#include <linux/ima.h>
27#include <linux/syscalls.h>
28#include <linux/mount.h>
29#include <linux/audit.h>
30#include <linux/capability.h>
31#include <linux/file.h>
32#include <linux/fcntl.h>
33#include <linux/device_cgroup.h>
34#include <linux/fs_struct.h>
35#include <asm/uaccess.h>
36
37#include "internal.h"
38
39/* [Feb-1997 T. Schoebel-Theuer]
40 * Fundamental changes in the pathname lookup mechanisms (namei)
41 * were necessary because of omirr. The reason is that omirr needs
42 * to know the _real_ pathname, not the user-supplied one, in case
43 * of symlinks (and also when transname replacements occur).
44 *
45 * The new code replaces the old recursive symlink resolution with
46 * an iterative one (in case of non-nested symlink chains). It does
47 * this with calls to <fs>_follow_link().
48 * As a side effect, dir_namei(), _namei() and follow_link() are now
49 * replaced with a single function lookup_dentry() that can handle all
50 * the special cases of the former code.
51 *
52 * With the new dcache, the pathname is stored at each inode, at least as
53 * long as the refcount of the inode is positive. As a side effect, the
54 * size of the dcache depends on the inode cache and thus is dynamic.
55 *
56 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
57 * resolution to correspond with current state of the code.
58 *
59 * Note that the symlink resolution is not *completely* iterative.
60 * There is still a significant amount of tail- and mid- recursion in
61 * the algorithm. Also, note that <fs>_readlink() is not used in
62 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
63 * may return different results than <fs>_follow_link(). Many virtual
64 * filesystems (including /proc) exhibit this behavior.
65 */
66
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent. The new semantics also affects mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */
83
84/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
85 * semantics. See the comments in "open_namei" and "do_link" below.
86 *
87 * [10-Sep-98 Alan Modra] Another symlink change.
88 */
89
90/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
91 * inside the path - always follow.
92 * in the last component in creation/removal/renaming - never follow.
93 * if LOOKUP_FOLLOW passed - follow.
94 * if the pathname has trailing slashes - follow.
95 * otherwise - don't follow.
96 * (applied in that order).
97 *
98 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
99 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
100 * During the 2.4 we need to fix the userland stuff depending on it -
101 * hopefully we will be able to get rid of that wart in 2.5. So far only
102 * XEmacs seems to be relying on it...
103 */
104/*
105 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
106 * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
107 * any extra contention...
108 */
109
110/* In order to reduce some races, while at the same time doing additional
111 * checking and hopefully speeding things up, we copy filenames to the
112 * kernel data space before using them..
113 *
114 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
115 * PATH_MAX includes the nul terminator --RR.
116 */
117static int do_getname(const char __user *filename, char *page)
118{
119    int retval;
120    unsigned long len = PATH_MAX;
121
122    if (!segment_eq(get_fs(), KERNEL_DS)) {
123        if ((unsigned long) filename >= TASK_SIZE)
124            return -EFAULT;
125        if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
126            len = TASK_SIZE - (unsigned long) filename;
127    }
128
129    retval = strncpy_from_user(page, filename, len);
130    if (retval > 0) {
131        if (retval < len)
132            return 0;
133        return -ENAMETOOLONG;
134    } else if (!retval)
135        retval = -ENOENT;
136    return retval;
137}
138
139char * getname(const char __user * filename)
140{
141    char *tmp, *result;
142
143    result = ERR_PTR(-ENOMEM);
144    tmp = __getname();
145    if (tmp) {
146        int retval = do_getname(filename, tmp);
147
148        result = tmp;
149        if (retval < 0) {
150            __putname(tmp);
151            result = ERR_PTR(retval);
152        }
153    }
154    audit_getname(result);
155    return result;
156}
157
158#ifdef CONFIG_AUDITSYSCALL
/*
 * Release a name buffer. When the audit context is not a dummy one the
 * name is handed to audit_putname(); otherwise it is released directly
 * with __putname().
 */
void putname(const char *name)
{
    if (unlikely(!audit_dummy_context())) {
        audit_putname(name);
        return;
    }
    __putname(name);
}
EXPORT_SYMBOL(putname);
167#endif
168
169/*
170 * This does basic POSIX ACL permission checking
171 */
172static int acl_permission_check(struct inode *inode, int mask,
173        int (*check_acl)(struct inode *inode, int mask))
174{
175    umode_t mode = inode->i_mode;
176
177    mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178
179    if (current_fsuid() == inode->i_uid)
180        mode >>= 6;
181    else {
182        if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
183            int error = check_acl(inode, mask);
184            if (error != -EAGAIN)
185                return error;
186        }
187
188        if (in_group_p(inode->i_gid))
189            mode >>= 3;
190    }
191
192    /*
193     * If the DACs are ok we don't need any capability check.
194     */
195    if ((mask & ~mode) == 0)
196        return 0;
197    return -EACCES;
198}
199
200/**
201 * generic_permission - check for access rights on a Posix-like filesystem
202 * @inode: inode to check access rights for
203 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
204 * @check_acl: optional callback to check for Posix ACLs
205 *
206 * Used to check for read/write/execute permissions on a file.
207 * We use "fsuid" for this, letting us set arbitrary permissions
208 * for filesystem access without changing the "normal" uids which
209 * are used for other things..
210 */
211int generic_permission(struct inode *inode, int mask,
212        int (*check_acl)(struct inode *inode, int mask))
213{
214    int ret;
215
216    /*
217     * Do the basic POSIX ACL permission checks.
218     */
219    ret = acl_permission_check(inode, mask, check_acl);
220    if (ret != -EACCES)
221        return ret;
222
223    /*
224     * Read/write DACs are always overridable.
225     * Executable DACs are overridable if at least one exec bit is set.
226     */
227    if (!(mask & MAY_EXEC) || execute_ok(inode))
228        if (capable(CAP_DAC_OVERRIDE))
229            return 0;
230
231    /*
232     * Searching includes executable on directories, else just read.
233     */
234    mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
235    if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
236        if (capable(CAP_DAC_READ_SEARCH))
237            return 0;
238
239    return -EACCES;
240}
241
242/**
243 * inode_permission - check for access rights to a given inode
244 * @inode: inode to check permission on
245 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
246 *
247 * Used to check for read/write/execute permissions on an inode.
248 * We use "fsuid" for this, letting us set arbitrary permissions
249 * for filesystem access without changing the "normal" uids which
250 * are used for other things.
251 */
252int inode_permission(struct inode *inode, int mask)
253{
254    int retval;
255
256    if (mask & MAY_WRITE) {
257        umode_t mode = inode->i_mode;
258
259        /*
260         * Nobody gets write access to a read-only fs.
261         */
262        if (IS_RDONLY(inode) &&
263            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
264            return -EROFS;
265
266        /*
267         * Nobody gets write access to an immutable file.
268         */
269        if (IS_IMMUTABLE(inode))
270            return -EACCES;
271    }
272
273    if (inode->i_op->permission)
274        retval = inode->i_op->permission(inode, mask);
275    else
276        retval = generic_permission(inode, mask, inode->i_op->check_acl);
277
278    if (retval)
279        return retval;
280
281    retval = devcgroup_inode_permission(inode, mask);
282    if (retval)
283        return retval;
284
285    return security_inode_permission(inode,
286            mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
287}
288
289/**
290 * file_permission - check for additional access rights to a given file
291 * @file: file to check access rights for
292 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
293 *
294 * Used to check for read/write/execute permissions on an already opened
295 * file.
296 *
297 * Note:
298 * Do not use this function in new code. All access checks should
299 * be done using inode_permission().
300 */
301int file_permission(struct file *file, int mask)
302{
303    return inode_permission(file->f_path.dentry->d_inode, mask);
304}
305
306/*
307 * get_write_access() gets write permission for a file.
308 * put_write_access() releases this write permission.
309 * This is used for regular files.
310 * We cannot support write (and maybe mmap read-write shared) accesses and
311 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
312 * can have the following values:
313 * 0: no writers, no VM_DENYWRITE mappings
314 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
315 * > 0: (i_writecount) users are writing to the file.
316 *
317 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
318 * except for the cases where we don't hold i_writecount yet. Then we need to
319 * use {get,deny}_write_access() - these functions check the sign and refuse
320 * to do the change if sign is wrong. Exclusion between them is provided by
321 * the inode->i_lock spinlock.
322 */
323
324int get_write_access(struct inode * inode)
325{
326    spin_lock(&inode->i_lock);
327    if (atomic_read(&inode->i_writecount) < 0) {
328        spin_unlock(&inode->i_lock);
329        return -ETXTBSY;
330    }
331    atomic_inc(&inode->i_writecount);
332    spin_unlock(&inode->i_lock);
333
334    return 0;
335}
336
337int deny_write_access(struct file * file)
338{
339    struct inode *inode = file->f_path.dentry->d_inode;
340
341    spin_lock(&inode->i_lock);
342    if (atomic_read(&inode->i_writecount) > 0) {
343        spin_unlock(&inode->i_lock);
344        return -ETXTBSY;
345    }
346    atomic_dec(&inode->i_writecount);
347    spin_unlock(&inode->i_lock);
348
349    return 0;
350}
351
352/**
353 * path_get - get a reference to a path
354 * @path: path to get the reference to
355 *
356 * Given a path increment the reference count to the dentry and the vfsmount.
357 */
358void path_get(struct path *path)
359{
360    mntget(path->mnt);
361    dget(path->dentry);
362}
363EXPORT_SYMBOL(path_get);
364
365/**
366 * path_put - put a reference to a path
367 * @path: path to put the reference to
368 *
369 * Given a path decrement the reference count to the dentry and the vfsmount.
370 */
371void path_put(struct path *path)
372{
373    dput(path->dentry);
374    mntput(path->mnt);
375}
376EXPORT_SYMBOL(path_put);
377
378/**
379 * release_open_intent - free up open intent resources
380 * @nd: pointer to nameidata
381 */
382void release_open_intent(struct nameidata *nd)
383{
384    if (nd->intent.open.file->f_path.dentry == NULL)
385        put_filp(nd->intent.open.file);
386    else
387        fput(nd->intent.open.file);
388}
389
390static inline struct dentry *
391do_revalidate(struct dentry *dentry, struct nameidata *nd)
392{
393    int status = dentry->d_op->d_revalidate(dentry, nd);
394    if (unlikely(status <= 0)) {
395        /*
396         * The dentry failed validation.
397         * If d_revalidate returned 0 attempt to invalidate
398         * the dentry otherwise d_revalidate is asking us
399         * to return a fail status.
400         */
401        if (!status) {
402            if (!d_invalidate(dentry)) {
403                dput(dentry);
404                dentry = NULL;
405            }
406        } else {
407            dput(dentry);
408            dentry = ERR_PTR(status);
409        }
410    }
411    return dentry;
412}
413
/*
 * force_reval_path - force revalidation of a dentry
 *
 * In some situations the path walking code will trust dentries without
 * revalidating them. This causes problems for filesystems that depend on
 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
 * (which indicates that it's possible for the dentry to go stale), force
 * a d_revalidate call before proceeding.
 *
 * Returns 0 if the revalidation was successful. If the revalidation fails,
 * returns either the error returned by d_revalidate, or -ESTALE if
 * d_revalidate just returned 0. If d_revalidate returns 0, we also attempt
 * to invalidate the dentry. It's up to the caller to handle putting
 * references to the path if necessary.
 */
429static int
430force_reval_path(struct path *path, struct nameidata *nd)
431{
432    int status;
433    struct dentry *dentry = path->dentry;
434
435    /*
436     * only check on filesystems where it's possible for the dentry to
437     * become stale. It's assumed that if this flag is set then the
438     * d_revalidate op will also be defined.
439     */
440    if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
441        return 0;
442
443    status = dentry->d_op->d_revalidate(dentry, nd);
444    if (status > 0)
445        return 0;
446
447    if (!status) {
448        d_invalidate(dentry);
449        status = -ESTALE;
450    }
451    return status;
452}
453
454/*
455 * Short-cut version of permission(), for calling on directories
456 * during pathname resolution. Combines parts of permission()
457 * and generic_permission(), and tests ONLY for MAY_EXEC permission.
458 *
459 * If appropriate, check DAC only. If not appropriate, or
460 * short-cut DAC fails, then call ->permission() to do more
461 * complete permission check.
462 */
463static int exec_permission(struct inode *inode)
464{
465    int ret;
466
467    if (inode->i_op->permission) {
468        ret = inode->i_op->permission(inode, MAY_EXEC);
469        if (!ret)
470            goto ok;
471        return ret;
472    }
473    ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
474    if (!ret)
475        goto ok;
476
477    if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
478        goto ok;
479
480    return ret;
481ok:
482    return security_inode_permission(inode, MAY_EXEC);
483}
484
485static __always_inline void set_root(struct nameidata *nd)
486{
487    if (!nd->root.mnt) {
488        struct fs_struct *fs = current->fs;
489        read_lock(&fs->lock);
490        nd->root = fs->root;
491        path_get(&nd->root);
492        read_unlock(&fs->lock);
493    }
494}
495
496static int link_path_walk(const char *, struct nameidata *);
497
498static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
499{
500    if (IS_ERR(link))
501        goto fail;
502
503    if (*link == '/') {
504        set_root(nd);
505        path_put(&nd->path);
506        nd->path = nd->root;
507        path_get(&nd->root);
508    }
509
510    return link_path_walk(link, nd);
511fail:
512    path_put(&nd->path);
513    return PTR_ERR(link);
514}
515
516static void path_put_conditional(struct path *path, struct nameidata *nd)
517{
518    dput(path->dentry);
519    if (path->mnt != nd->path.mnt)
520        mntput(path->mnt);
521}
522
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{
525    dput(nd->path.dentry);
526    if (nd->path.mnt != path->mnt)
527        mntput(nd->path.mnt);
528    nd->path.mnt = path->mnt;
529    nd->path.dentry = path->dentry;
530}
531
532static __always_inline int
533__do_follow_link(struct path *path, struct nameidata *nd, void **p)
534{
535    int error;
536    struct dentry *dentry = path->dentry;
537
538    touch_atime(path->mnt, dentry);
539    nd_set_link(nd, NULL);
540
541    if (path->mnt != nd->path.mnt) {
542        path_to_nameidata(path, nd);
543        dget(dentry);
544    }
545    mntget(path->mnt);
546    nd->last_type = LAST_BIND;
547    *p = dentry->d_inode->i_op->follow_link(dentry, nd);
548    error = PTR_ERR(*p);
549    if (!IS_ERR(*p)) {
550        char *s = nd_get_link(nd);
551        error = 0;
552        if (s)
553            error = __vfs_follow_link(nd, s);
554        else if (nd->last_type == LAST_BIND) {
555            error = force_reval_path(&nd->path, nd);
556            if (error)
557                path_put(&nd->path);
558        }
559    }
560    return error;
561}
562
563/*
564 * This limits recursive symlink follows to 8, while
565 * limiting consecutive symlinks to 40.
566 *
567 * Without that kind of total limit, nasty chains of consecutive
568 * symlinks can cause almost arbitrarily long lookups.
569 */
570static inline int do_follow_link(struct path *path, struct nameidata *nd)
571{
572    void *cookie;
573    int err = -ELOOP;
574    if (current->link_count >= MAX_NESTED_LINKS)
575        goto loop;
576    if (current->total_link_count >= 40)
577        goto loop;
578    BUG_ON(nd->depth >= MAX_NESTED_LINKS);
579    cond_resched();
580    err = security_inode_follow_link(path->dentry, nd);
581    if (err)
582        goto loop;
583    current->link_count++;
584    current->total_link_count++;
585    nd->depth++;
586    err = __do_follow_link(path, nd, &cookie);
587    if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
588        path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
589    path_put(path);
590    current->link_count--;
591    nd->depth--;
592    return err;
593loop:
594    path_put_conditional(path, nd);
595    path_put(&nd->path);
596    return err;
597}
598
599int follow_up(struct path *path)
600{
601    struct vfsmount *parent;
602    struct dentry *mountpoint;
603    spin_lock(&vfsmount_lock);
604    parent = path->mnt->mnt_parent;
605    if (parent == path->mnt) {
606        spin_unlock(&vfsmount_lock);
607        return 0;
608    }
609    mntget(parent);
610    mountpoint = dget(path->mnt->mnt_mountpoint);
611    spin_unlock(&vfsmount_lock);
612    dput(path->dentry);
613    path->dentry = mountpoint;
614    mntput(path->mnt);
615    path->mnt = parent;
616    return 1;
617}
618
619/* no need for dcache_lock, as serialization is taken care in
620 * namespace.c
621 */
622static int __follow_mount(struct path *path)
623{
624    int res = 0;
625    while (d_mountpoint(path->dentry)) {
626        struct vfsmount *mounted = lookup_mnt(path);
627        if (!mounted)
628            break;
629        dput(path->dentry);
630        if (res)
631            mntput(path->mnt);
632        path->mnt = mounted;
633        path->dentry = dget(mounted->mnt_root);
634        res = 1;
635    }
636    return res;
637}
638
639static void follow_mount(struct path *path)
640{
641    while (d_mountpoint(path->dentry)) {
642        struct vfsmount *mounted = lookup_mnt(path);
643        if (!mounted)
644            break;
645        dput(path->dentry);
646        mntput(path->mnt);
647        path->mnt = mounted;
648        path->dentry = dget(mounted->mnt_root);
649    }
650}
651
652/* no need for dcache_lock, as serialization is taken care in
653 * namespace.c
654 */
655int follow_down(struct path *path)
656{
657    struct vfsmount *mounted;
658
659    mounted = lookup_mnt(path);
660    if (mounted) {
661        dput(path->dentry);
662        mntput(path->mnt);
663        path->mnt = mounted;
664        path->dentry = dget(mounted->mnt_root);
665        return 1;
666    }
667    return 0;
668}
669
670static __always_inline void follow_dotdot(struct nameidata *nd)
671{
672    set_root(nd);
673
674    while(1) {
675        struct dentry *old = nd->path.dentry;
676
677        if (nd->path.dentry == nd->root.dentry &&
678            nd->path.mnt == nd->root.mnt) {
679            break;
680        }
681        if (nd->path.dentry != nd->path.mnt->mnt_root) {
682            /* rare case of legitimate dget_parent()... */
683            nd->path.dentry = dget_parent(nd->path.dentry);
684            dput(old);
685            break;
686        }
687        if (!follow_up(&nd->path))
688            break;
689    }
690    follow_mount(&nd->path);
691}
692
693/*
694 * It's more convoluted than I'd like it to be, but... it's still fairly
695 * small and for now I'd prefer to have fast path as straight as possible.
696 * It _is_ time-critical.
697 */
/*
 * Look up the component @name in the directory nd->path.dentry.
 *
 * On success returns 0 with @path filled in (mount taken from nd->path.mnt,
 * then advanced across any mounts by __follow_mount()). On failure returns
 * a negative error and leaves @path untouched.
 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
             struct path *path)
{
    struct vfsmount *mnt = nd->path.mnt;
    struct dentry *dentry, *parent;
    struct inode *dir;
    /*
     * See if the low-level filesystem might want
     * to use its own hash..
     */
    if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
        int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
        if (err < 0)
            return err;
    }

    /* Fast path: lockless dcache lookup. */
    dentry = __d_lookup(nd->path.dentry, name);
    if (!dentry)
        goto need_lookup;
    if (dentry->d_op && dentry->d_op->d_revalidate)
        goto need_revalidate;
done:
    /* Common success exit: publish the result and cross mountpoints. */
    path->mnt = mnt;
    path->dentry = dentry;
    __follow_mount(path);
    return 0;

need_lookup:
    /* Slow path: serialize on the parent directory's i_mutex. */
    parent = nd->path.dentry;
    dir = parent->d_inode;

    mutex_lock(&dir->i_mutex);
    /*
     * First re-do the cached lookup just in case it was created
     * while we waited for the directory semaphore..
     *
     * FIXME! This could use version numbering or similar to
     * avoid unnecessary cache lookups.
     *
     * The "dcache_lock" is purely to protect the RCU list walker
     * from concurrent renames at this point (we mustn't get false
     * negatives from the RCU list walk here, unlike the optimistic
     * fast walk).
     *
     * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
     */
    dentry = d_lookup(parent, name);
    if (!dentry) {
        struct dentry *new;

        /* Don't create child dentry for a dead directory. */
        dentry = ERR_PTR(-ENOENT);
        if (IS_DEADDIR(dir))
            goto out_unlock;

        new = d_alloc(parent, name);
        dentry = ERR_PTR(-ENOMEM);
        if (new) {
            /*
             * Ask the filesystem. A non-NULL return (a dentry or an
             * error pointer) replaces the one we allocated; a NULL
             * return means our new dentry is the result.
             */
            dentry = dir->i_op->lookup(dir, new, nd);
            if (dentry)
                dput(new);
            else
                dentry = new;
        }
out_unlock:
        mutex_unlock(&dir->i_mutex);
        if (IS_ERR(dentry))
            goto fail;
        goto done;
    }

    /*
     * Uhhuh! Nasty case: the cache was re-populated while
     * we waited on the semaphore. Need to revalidate.
     */
    mutex_unlock(&dir->i_mutex);
    if (dentry->d_op && dentry->d_op->d_revalidate) {
        dentry = do_revalidate(dentry, nd);
        /* An invalidated dentry is reported as -ENOENT here, not retried. */
        if (!dentry)
            dentry = ERR_PTR(-ENOENT);
    }
    if (IS_ERR(dentry))
        goto fail;
    goto done;

need_revalidate:
    /*
     * Fast-path hit on a dentry that needs revalidation. A NULL result
     * from do_revalidate() (dentry invalidated) falls back to the slow
     * path; an error pointer is a hard failure.
     */
    dentry = do_revalidate(dentry, nd);
    if (!dentry)
        goto need_lookup;
    if (IS_ERR(dentry))
        goto fail;
    goto done;

fail:
    /* dentry holds an error pointer on every path that reaches here. */
    return PTR_ERR(dentry);
}
794
795/*
796 * This is a temporary kludge to deal with "automount" symlinks; proper
797 * solution is to trigger them on follow_mount(), so that do_lookup()
798 * would DTRT. To be killed before 2.6.34-final.
799 */
800static inline int follow_on_final(struct inode *inode, unsigned lookup_flags)
801{
802    return inode && unlikely(inode->i_op->follow_link) &&
803        ((lookup_flags & LOOKUP_FOLLOW) || S_ISDIR(inode->i_mode));
804}
805
806/*
807 * Name resolution.
808 * This is the basic name resolution function, turning a pathname into
809 * the final dentry. We expect 'base' to be positive and a directory.
810 *
811 * Returns 0 and nd will have valid dentry and mnt on success.
812 * Returns error and drops reference to input namei data on failure.
813 */
/*
 * Walk @name one component at a time, starting from nd->path.
 *
 * Intermediate components are looked up and symlinks among them are
 * always followed. The final component is handled according to
 * lookup_flags: with LOOKUP_PARENT it is only recorded in nd->last /
 * nd->last_type; otherwise it is looked up and followed when
 * follow_on_final() says so.
 *
 * Note that the labels from last_with_slashes down to out_dput live
 * inside the for(;;) body, so a "break" after any of them leaves the loop
 * and reaches the path_put() at the bottom.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
    struct path next;
    struct inode *inode;
    int err;
    unsigned int lookup_flags = nd->flags;
    
    /* Skip leading slashes; nothing left means there is nothing to walk. */
    while (*name=='/')
        name++;
    if (!*name)
        goto return_reval;

    inode = nd->path.dentry->d_inode;
    /*
     * When called with nd->depth != 0, force LOOKUP_FOLLOW and keep only
     * LOOKUP_CONTINUE from the caller's flags.
     */
    if (nd->depth)
        lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);

    /* At this point we know we have a real path component. */
    for(;;) {
        unsigned long hash;
        struct qstr this;
        unsigned int c;

        nd->flags |= LOOKUP_CONTINUE;
        /* Search permission is required on the current directory. */
        err = exec_permission(inode);
         if (err)
            break;

        this.name = name;
        c = *(const unsigned char *)name;

        /* Hash the component up to (not including) the next '/' or NUL. */
        hash = init_name_hash();
        do {
            name++;
            hash = partial_name_hash(c, hash);
            c = *(const unsigned char *)name;
        } while (c && (c != '/'));
        this.len = name - (const char *) this.name;
        this.hash = end_name_hash(hash);

        /* remove trailing slashes? */
        if (!c)
            goto last_component;
        while (*++name == '/');
        if (!*name)
            goto last_with_slashes;

        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (this.name[0] == '.') switch (this.len) {
            default:
                break;
            case 2:
                if (this.name[1] != '.')
                    break;
                follow_dotdot(nd);
                inode = nd->path.dentry->d_inode;
                /* fallthrough */
            case 1:
                continue;
        }
        /* This does the actual lookups.. */
        err = do_lookup(nd, &this, &next);
        if (err)
            break;

        /* A negative dentry in the middle of a path is -ENOENT. */
        err = -ENOENT;
        inode = next.dentry->d_inode;
        if (!inode)
            goto out_dput;

        if (inode->i_op->follow_link) {
            /* Intermediate symlinks are always followed. */
            err = do_follow_link(&next, nd);
            if (err)
                goto return_err;
            err = -ENOENT;
            inode = nd->path.dentry->d_inode;
            if (!inode)
                break;
        } else
            path_to_nameidata(&next, nd);
        /* More components follow, so this one must support ->lookup. */
        err = -ENOTDIR;
        if (!inode->i_op->lookup)
            break;
        continue;
        /* here ends the main loop */

last_with_slashes:
        /* Trailing slashes: follow the link and insist on a directory. */
        lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
last_component:
        /* Clear LOOKUP_CONTINUE iff it was previously unset */
        nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
        if (lookup_flags & LOOKUP_PARENT)
            goto lookup_parent;
        if (this.name[0] == '.') switch (this.len) {
            default:
                break;
            case 2:
                if (this.name[1] != '.')
                    break;
                follow_dotdot(nd);
                inode = nd->path.dentry->d_inode;
                /* fallthrough */
            case 1:
                goto return_reval;
        }
        err = do_lookup(nd, &this, &next);
        if (err)
            break;
        inode = next.dentry->d_inode;
        if (follow_on_final(inode, lookup_flags)) {
            err = do_follow_link(&next, nd);
            if (err)
                goto return_err;
            inode = nd->path.dentry->d_inode;
        } else
            path_to_nameidata(&next, nd);
        /* Unlike the intermediate case, nd->path is already updated here. */
        err = -ENOENT;
        if (!inode)
            break;
        if (lookup_flags & LOOKUP_DIRECTORY) {
            err = -ENOTDIR;
            if (!inode->i_op->lookup)
                break;
        }
        goto return_base;
lookup_parent:
        /*
         * LOOKUP_PARENT: do not look the last component up, just record
         * it and classify it as normal, "." or "..".
         */
        nd->last = this;
        nd->last_type = LAST_NORM;
        if (this.name[0] != '.')
            goto return_base;
        if (this.len == 1)
            nd->last_type = LAST_DOT;
        else if (this.len == 2 && this.name[1] == '.')
            nd->last_type = LAST_DOTDOT;
        else
            goto return_base;
return_reval:
        /*
         * We bypassed the ordinary revalidation routines.
         * We may need to check the cached dentry for staleness.
         */
        if (nd->path.dentry && nd->path.dentry->d_sb &&
            (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
            err = -ESTALE;
            /* Note: we do not d_invalidate() */
            if (!nd->path.dentry->d_op->d_revalidate(
                    nd->path.dentry, nd))
                break;
        }
return_base:
        return 0;
out_dput:
        /* Drop the looked-up "next" before taking the common failure exit. */
        path_put_conditional(&next, nd);
        break;
    }
    /* Common failure exit: drop the reference held in nd->path. */
    path_put(&nd->path);
return_err:
    /*
     * Reached directly only when do_follow_link() failed; presumably
     * nd->path has already been dropped on that path, hence no
     * path_put() here.
     */
    return err;
}
976
977static int path_walk(const char *name, struct nameidata *nd)
978{
979    struct path save = nd->path;
980    int result;
981
982    current->total_link_count = 0;
983
984    /* make sure the stuff we saved doesn't go away */
985    path_get(&save);
986
987    result = link_path_walk(name, nd);
988    if (result == -ESTALE) {
989        /* nd->path had been dropped */
990        current->total_link_count = 0;
991        nd->path = save;
992        path_get(&nd->path);
993        nd->flags |= LOOKUP_REVAL;
994        result = link_path_walk(name, nd);
995    }
996
997    path_put(&save);
998
999    return result;
1000}
1001
1002static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1003{
1004    int retval = 0;
1005    int fput_needed;
1006    struct file *file;
1007
1008    nd->last_type = LAST_ROOT; /* if there are only slashes... */
1009    nd->flags = flags;
1010    nd->depth = 0;
1011    nd->root.mnt = NULL;
1012
1013    if (*name=='/') {
1014        set_root(nd);
1015        nd->path = nd->root;
1016        path_get(&nd->root);
1017    } else if (dfd == AT_FDCWD) {
1018        struct fs_struct *fs = current->fs;
1019        read_lock(&fs->lock);
1020        nd->path = fs->pwd;
1021        path_get(&fs->pwd);
1022        read_unlock(&fs->lock);
1023    } else {
1024        struct dentry *dentry;
1025
1026        file = fget_light(dfd, &fput_needed);
1027        retval = -EBADF;
1028        if (!file)
1029            goto out_fail;
1030
1031        dentry = file->f_path.dentry;
1032
1033        retval = -ENOTDIR;
1034        if (!S_ISDIR(dentry->d_inode->i_mode))
1035            goto fput_fail;
1036
1037        retval = file_permission(file, MAY_EXEC);
1038        if (retval)
1039            goto fput_fail;
1040
1041        nd->path = file->f_path;
1042        path_get(&file->f_path);
1043
1044        fput_light(file, fput_needed);
1045    }
1046    return 0;
1047
1048fput_fail:
1049    fput_light(file, fput_needed);
1050out_fail:
1051    return retval;
1052}
1053
1054/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1055static int do_path_lookup(int dfd, const char *name,
1056                unsigned int flags, struct nameidata *nd)
1057{
1058    int retval = path_init(dfd, name, flags, nd);
1059    if (!retval)
1060        retval = path_walk(name, nd);
1061    if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1062                nd->path.dentry->d_inode))
1063        audit_inode(name, nd->path.dentry);
1064    if (nd->root.mnt) {
1065        path_put(&nd->root);
1066        nd->root.mnt = NULL;
1067    }
1068    return retval;
1069}
1070
1071int path_lookup(const char *name, unsigned int flags,
1072            struct nameidata *nd)
1073{
1074    return do_path_lookup(AT_FDCWD, name, flags, nd);
1075}
1076
1077int kern_path(const char *name, unsigned int flags, struct path *path)
1078{
1079    struct nameidata nd;
1080    int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
1081    if (!res)
1082        *path = nd.path;
1083    return res;
1084}
1085
1086/**
1087 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1088 * @dentry: pointer to dentry of the base directory
1089 * @mnt: pointer to vfs mount of the base directory
1090 * @name: pointer to file name
1091 * @flags: lookup flags
1092 * @nd: pointer to nameidata
1093 */
1094int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1095            const char *name, unsigned int flags,
1096            struct nameidata *nd)
1097{
1098    int retval;
1099
1100    /* same as do_path_lookup */
1101    nd->last_type = LAST_ROOT;
1102    nd->flags = flags;
1103    nd->depth = 0;
1104
1105    nd->path.dentry = dentry;
1106    nd->path.mnt = mnt;
1107    path_get(&nd->path);
1108    nd->root = nd->path;
1109    path_get(&nd->root);
1110
1111    retval = path_walk(name, nd);
1112    if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1113                nd->path.dentry->d_inode))
1114        audit_inode(name, nd->path.dentry);
1115
1116    path_put(&nd->root);
1117    nd->root.mnt = NULL;
1118
1119    return retval;
1120}
1121
1122static struct dentry *__lookup_hash(struct qstr *name,
1123        struct dentry *base, struct nameidata *nd)
1124{
1125    struct dentry *dentry;
1126    struct inode *inode;
1127    int err;
1128
1129    inode = base->d_inode;
1130
1131    /*
1132     * See if the low-level filesystem might want
1133     * to use its own hash..
1134     */
1135    if (base->d_op && base->d_op->d_hash) {
1136        err = base->d_op->d_hash(base, name);
1137        dentry = ERR_PTR(err);
1138        if (err < 0)
1139            goto out;
1140    }
1141
1142    dentry = __d_lookup(base, name);
1143
1144    /* lockess __d_lookup may fail due to concurrent d_move()
1145     * in some unrelated directory, so try with d_lookup
1146     */
1147    if (!dentry)
1148        dentry = d_lookup(base, name);
1149
1150    if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
1151        dentry = do_revalidate(dentry, nd);
1152
1153    if (!dentry) {
1154        struct dentry *new;
1155
1156        /* Don't create child dentry for a dead directory. */
1157        dentry = ERR_PTR(-ENOENT);
1158        if (IS_DEADDIR(inode))
1159            goto out;
1160
1161        new = d_alloc(base, name);
1162        dentry = ERR_PTR(-ENOMEM);
1163        if (!new)
1164            goto out;
1165        dentry = inode->i_op->lookup(inode, new, nd);
1166        if (!dentry)
1167            dentry = new;
1168        else
1169            dput(new);
1170    }
1171out:
1172    return dentry;
1173}
1174
1175/*
1176 * Restricted form of lookup. Doesn't follow links, single-component only,
1177 * needs parent already locked. Doesn't follow mounts.
1178 * SMP-safe.
1179 */
1180static struct dentry *lookup_hash(struct nameidata *nd)
1181{
1182    int err;
1183
1184    err = exec_permission(nd->path.dentry->d_inode);
1185    if (err)
1186        return ERR_PTR(err);
1187    return __lookup_hash(&nd->last, nd->path.dentry, nd);
1188}
1189
1190static int __lookup_one_len(const char *name, struct qstr *this,
1191        struct dentry *base, int len)
1192{
1193    unsigned long hash;
1194    unsigned int c;
1195
1196    this->name = name;
1197    this->len = len;
1198    if (!len)
1199        return -EACCES;
1200
1201    hash = init_name_hash();
1202    while (len--) {
1203        c = *(const unsigned char *)name++;
1204        if (c == '/' || c == '\0')
1205            return -EACCES;
1206        hash = partial_name_hash(c, hash);
1207    }
1208    this->hash = end_name_hash(hash);
1209    return 0;
1210}
1211
1212/**
1213 * lookup_one_len - filesystem helper to lookup single pathname component
1214 * @name: pathname component to lookup
1215 * @base: base directory to lookup from
1216 * @len: maximum length @len should be interpreted to
1217 *
1218 * Note that this routine is purely a helper for filesystem usage and should
1219 * not be called by generic code. Also note that by using this function the
1220 * nameidata argument is passed to the filesystem methods and a filesystem
1221 * using this helper needs to be prepared for that.
1222 */
1223struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1224{
1225    int err;
1226    struct qstr this;
1227
1228    WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1229
1230    err = __lookup_one_len(name, &this, base, len);
1231    if (err)
1232        return ERR_PTR(err);
1233
1234    err = exec_permission(base->d_inode);
1235    if (err)
1236        return ERR_PTR(err);
1237    return __lookup_hash(&this, base, NULL);
1238}
1239
1240int user_path_at(int dfd, const char __user *name, unsigned flags,
1241         struct path *path)
1242{
1243    struct nameidata nd;
1244    char *tmp = getname(name);
1245    int err = PTR_ERR(tmp);
1246    if (!IS_ERR(tmp)) {
1247
1248        BUG_ON(flags & LOOKUP_PARENT);
1249
1250        err = do_path_lookup(dfd, tmp, flags, &nd);
1251        putname(tmp);
1252        if (!err)
1253            *path = nd.path;
1254    }
1255    return err;
1256}
1257
1258static int user_path_parent(int dfd, const char __user *path,
1259            struct nameidata *nd, char **name)
1260{
1261    char *s = getname(path);
1262    int error;
1263
1264    if (IS_ERR(s))
1265        return PTR_ERR(s);
1266
1267    error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
1268    if (error)
1269        putname(s);
1270    else
1271        *name = s;
1272
1273    return error;
1274}
1275
1276/*
1277 * It's inline, so penalty for filesystems that don't use sticky bit is
1278 * minimal.
1279 */
1280static inline int check_sticky(struct inode *dir, struct inode *inode)
1281{
1282    uid_t fsuid = current_fsuid();
1283
1284    if (!(dir->i_mode & S_ISVTX))
1285        return 0;
1286    if (inode->i_uid == fsuid)
1287        return 0;
1288    if (dir->i_uid == fsuid)
1289        return 0;
1290    return !capable(CAP_FOWNER);
1291}
1292
1293/*
1294 * Check whether we can remove a link victim from directory dir, check
1295 * whether the type of victim is right.
1296 * 1. We can't do it if dir is read-only (done in permission())
1297 * 2. We should have write and exec permissions on dir
1298 * 3. We can't remove anything from append-only dir
1299 * 4. We can't do anything with immutable dir (done in permission())
1300 * 5. If the sticky bit on dir is set we should either
1301 * a. be owner of dir, or
1302 * b. be owner of victim, or
1303 * c. have CAP_FOWNER capability
1304 * 6. If the victim is append-only or immutable we can't do antyhing with
1305 * links pointing to it.
1306 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1307 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1308 * 9. We can't remove a root or mountpoint.
1309 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1310 * nfs_async_unlink().
1311 */
1312static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1313{
1314    int error;
1315
1316    if (!victim->d_inode)
1317        return -ENOENT;
1318
1319    BUG_ON(victim->d_parent->d_inode != dir);
1320    audit_inode_child(victim, dir);
1321
1322    error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
1323    if (error)
1324        return error;
1325    if (IS_APPEND(dir))
1326        return -EPERM;
1327    if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1328        IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
1329        return -EPERM;
1330    if (isdir) {
1331        if (!S_ISDIR(victim->d_inode->i_mode))
1332            return -ENOTDIR;
1333        if (IS_ROOT(victim))
1334            return -EBUSY;
1335    } else if (S_ISDIR(victim->d_inode->i_mode))
1336        return -EISDIR;
1337    if (IS_DEADDIR(dir))
1338        return -ENOENT;
1339    if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1340        return -EBUSY;
1341    return 0;
1342}
1343
1344/* Check whether we can create an object with dentry child in directory
1345 * dir.
1346 * 1. We can't do it if child already exists (open has special treatment for
1347 * this case, but since we are inlined it's OK)
1348 * 2. We can't do it if dir is read-only (done in permission())
1349 * 3. We should have write and exec permissions on dir
1350 * 4. We can't do it if dir is immutable (done in permission())
1351 */
1352static inline int may_create(struct inode *dir, struct dentry *child)
1353{
1354    if (child->d_inode)
1355        return -EEXIST;
1356    if (IS_DEADDIR(dir))
1357        return -ENOENT;
1358    return inode_permission(dir, MAY_WRITE | MAY_EXEC);
1359}
1360
1361/*
1362 * p1 and p2 should be directories on the same fs.
1363 */
1364struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1365{
1366    struct dentry *p;
1367
1368    if (p1 == p2) {
1369        mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1370        return NULL;
1371    }
1372
1373    mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1374
1375    p = d_ancestor(p2, p1);
1376    if (p) {
1377        mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
1378        mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
1379        return p;
1380    }
1381
1382    p = d_ancestor(p1, p2);
1383    if (p) {
1384        mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1385        mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1386        return p;
1387    }
1388
1389    mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1390    mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
1391    return NULL;
1392}
1393
1394void unlock_rename(struct dentry *p1, struct dentry *p2)
1395{
1396    mutex_unlock(&p1->d_inode->i_mutex);
1397    if (p1 != p2) {
1398        mutex_unlock(&p2->d_inode->i_mutex);
1399        mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
1400    }
1401}
1402
1403int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1404        struct nameidata *nd)
1405{
1406    int error = may_create(dir, dentry);
1407
1408    if (error)
1409        return error;
1410
1411    if (!dir->i_op->create)
1412        return -EACCES; /* shouldn't it be ENOSYS? */
1413    mode &= S_IALLUGO;
1414    mode |= S_IFREG;
1415    error = security_inode_create(dir, dentry, mode);
1416    if (error)
1417        return error;
1418    error = dir->i_op->create(dir, dentry, mode, nd);
1419    if (!error)
1420        fsnotify_create(dir, dentry);
1421    return error;
1422}
1423
1424int may_open(struct path *path, int acc_mode, int flag)
1425{
1426    struct dentry *dentry = path->dentry;
1427    struct inode *inode = dentry->d_inode;
1428    int error;
1429
1430    if (!inode)
1431        return -ENOENT;
1432
1433    switch (inode->i_mode & S_IFMT) {
1434    case S_IFLNK:
1435        return -ELOOP;
1436    case S_IFDIR:
1437        if (acc_mode & MAY_WRITE)
1438            return -EISDIR;
1439        break;
1440    case S_IFBLK:
1441    case S_IFCHR:
1442        if (path->mnt->mnt_flags & MNT_NODEV)
1443            return -EACCES;
1444        /*FALLTHRU*/
1445    case S_IFIFO:
1446    case S_IFSOCK:
1447        flag &= ~O_TRUNC;
1448        break;
1449    }
1450
1451    error = inode_permission(inode, acc_mode);
1452    if (error)
1453        return error;
1454
1455    /*
1456     * An append-only file must be opened in append mode for writing.
1457     */
1458    if (IS_APPEND(inode)) {
1459        if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
1460            return -EPERM;
1461        if (flag & O_TRUNC)
1462            return -EPERM;
1463    }
1464
1465    /* O_NOATIME can only be set by the owner or superuser */
1466    if (flag & O_NOATIME && !is_owner_or_cap(inode))
1467        return -EPERM;
1468
1469    /*
1470     * Ensure there are no outstanding leases on the file.
1471     */
1472    return break_lease(inode, flag);
1473}
1474
1475static int handle_truncate(struct path *path)
1476{
1477    struct inode *inode = path->dentry->d_inode;
1478    int error = get_write_access(inode);
1479    if (error)
1480        return error;
1481    /*
1482     * Refuse to truncate files with mandatory locks held on them.
1483     */
1484    error = locks_verify_locked(inode);
1485    if (!error)
1486        error = security_path_truncate(path, 0,
1487                       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1488    if (!error) {
1489        error = do_truncate(path->dentry, 0,
1490                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
1491                    NULL);
1492    }
1493    put_write_access(inode);
1494    return error;
1495}
1496
1497/*
1498 * Be careful about ever adding any more callers of this
1499 * function. Its flags must be in the namei format, not
1500 * what get passed to sys_open().
1501 */
1502static int __open_namei_create(struct nameidata *nd, struct path *path,
1503                int open_flag, int mode)
1504{
1505    int error;
1506    struct dentry *dir = nd->path.dentry;
1507
1508    if (!IS_POSIXACL(dir->d_inode))
1509        mode &= ~current_umask();
1510    error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1511    if (error)
1512        goto out_unlock;
1513    error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1514out_unlock:
1515    mutex_unlock(&dir->d_inode->i_mutex);
1516    dput(nd->path.dentry);
1517    nd->path.dentry = path->dentry;
1518    if (error)
1519        return error;
1520    /* Don't check for write permission, don't truncate */
1521    return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
1522}
1523
/*
 * Convert sys_open() flags into the form used by the internal routines
 * (open_namei()/follow_link() etc).
 *
 * For sys_open the low two bits mean:
 *  00 - read-only
 *  01 - write-only
 *  10 - read-write
 *  11 - special
 * and they are changed into
 *  00 - no permissions needed
 *  01 - read-permission
 *  10 - write-permission
 *  11 - read-write
 * This is more logical, and also allows the 00 "no perm needed"
 * to be used for symlinks (where the permissions are checked
 * later).
 *
 * In other words the value is incremented by one unless its access-mode
 * bits are already 11, in which case it is returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
    const int bumped = flag + 1;

    return (bumped & O_ACCMODE) ? bumped : flag;
}
1547
1548static int open_will_truncate(int flag, struct inode *inode)
1549{
1550    /*
1551     * We'll never write to the fs underlying
1552     * a device file.
1553     */
1554    if (special_file(inode->i_mode))
1555        return 0;
1556    return (flag & O_TRUNC);
1557}
1558
1559static struct file *finish_open(struct nameidata *nd,
1560                int open_flag, int acc_mode)
1561{
1562    struct file *filp;
1563    int will_truncate;
1564    int error;
1565
1566    will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
1567    if (will_truncate) {
1568        error = mnt_want_write(nd->path.mnt);
1569        if (error)
1570            goto exit;
1571    }
1572    error = may_open(&nd->path, acc_mode, open_flag);
1573    if (error) {
1574        if (will_truncate)
1575            mnt_drop_write(nd->path.mnt);
1576        goto exit;
1577    }
1578    filp = nameidata_to_filp(nd);
1579    if (!IS_ERR(filp)) {
1580        error = ima_file_check(filp, acc_mode);
1581        if (error) {
1582            fput(filp);
1583            filp = ERR_PTR(error);
1584        }
1585    }
1586    if (!IS_ERR(filp)) {
1587        if (will_truncate) {
1588            error = handle_truncate(&nd->path);
1589            if (error) {
1590                fput(filp);
1591                filp = ERR_PTR(error);
1592            }
1593        }
1594    }
1595    /*
1596     * It is now safe to drop the mnt write
1597     * because the filp has had a write taken
1598     * on its behalf.
1599     */
1600    if (will_truncate)
1601        mnt_drop_write(nd->path.mnt);
1602    return filp;
1603
1604exit:
1605    if (!IS_ERR(nd->intent.open.file))
1606        release_open_intent(nd);
1607    path_put(&nd->path);
1608    return ERR_PTR(error);
1609}
1610
1611static struct file *do_last(struct nameidata *nd, struct path *path,
1612                int open_flag, int acc_mode,
1613                int mode, const char *pathname)
1614{
1615    struct dentry *dir = nd->path.dentry;
1616    struct file *filp;
1617    int error = -EISDIR;
1618
1619    switch (nd->last_type) {
1620    case LAST_DOTDOT:
1621        follow_dotdot(nd);
1622        dir = nd->path.dentry;
1623        if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1624            if (!dir->d_op->d_revalidate(dir, nd)) {
1625                error = -ESTALE;
1626                goto exit;
1627            }
1628        }
1629        /* fallthrough */
1630    case LAST_DOT:
1631    case LAST_ROOT:
1632        if (open_flag & O_CREAT)
1633            goto exit;
1634        /* fallthrough */
1635    case LAST_BIND:
1636        audit_inode(pathname, dir);
1637        goto ok;
1638    }
1639
1640    /* trailing slashes? */
1641    if (nd->last.name[nd->last.len]) {
1642        if (open_flag & O_CREAT)
1643            goto exit;
1644        nd->flags |= LOOKUP_DIRECTORY;
1645    }
1646
1647    /* just plain open? */
1648    if (!(open_flag & O_CREAT)) {
1649        error = do_lookup(nd, &nd->last, path);
1650        if (error)
1651            goto exit;
1652        error = -ENOENT;
1653        if (!path->dentry->d_inode)
1654            goto exit_dput;
1655        if (path->dentry->d_inode->i_op->follow_link)
1656            return NULL;
1657        error = -ENOTDIR;
1658        if (nd->flags & LOOKUP_DIRECTORY) {
1659            if (!path->dentry->d_inode->i_op->lookup)
1660                goto exit_dput;
1661        }
1662        path_to_nameidata(path, nd);
1663        audit_inode(pathname, nd->path.dentry);
1664        goto ok;
1665    }
1666
1667    /* OK, it's O_CREAT */
1668    mutex_lock(&dir->d_inode->i_mutex);
1669
1670    path->dentry = lookup_hash(nd);
1671    path->mnt = nd->path.mnt;
1672
1673    error = PTR_ERR(path->dentry);
1674    if (IS_ERR(path->dentry)) {
1675        mutex_unlock(&dir->d_inode->i_mutex);
1676        goto exit;
1677    }
1678
1679    if (IS_ERR(nd->intent.open.file)) {
1680        error = PTR_ERR(nd->intent.open.file);
1681        goto exit_mutex_unlock;
1682    }
1683
1684    /* Negative dentry, just create the file */
1685    if (!path->dentry->d_inode) {
1686        /*
1687         * This write is needed to ensure that a
1688         * ro->rw transition does not occur between
1689         * the time when the file is created and when
1690         * a permanent write count is taken through
1691         * the 'struct file' in nameidata_to_filp().
1692         */
1693        error = mnt_want_write(nd->path.mnt);
1694        if (error)
1695            goto exit_mutex_unlock;
1696        error = __open_namei_create(nd, path, open_flag, mode);
1697        if (error) {
1698            mnt_drop_write(nd->path.mnt);
1699            goto exit;
1700        }
1701        filp = nameidata_to_filp(nd);
1702        mnt_drop_write(nd->path.mnt);
1703        if (!IS_ERR(filp)) {
1704            error = ima_file_check(filp, acc_mode);
1705            if (error) {
1706                fput(filp);
1707                filp = ERR_PTR(error);
1708            }
1709        }
1710        return filp;
1711    }
1712
1713    /*
1714     * It already exists.
1715     */
1716    mutex_unlock(&dir->d_inode->i_mutex);
1717    audit_inode(pathname, path->dentry);
1718
1719    error = -EEXIST;
1720    if (open_flag & O_EXCL)
1721        goto exit_dput;
1722
1723    if (__follow_mount(path)) {
1724        error = -ELOOP;
1725        if (open_flag & O_NOFOLLOW)
1726            goto exit_dput;
1727    }
1728
1729    error = -ENOENT;
1730    if (!path->dentry->d_inode)
1731        goto exit_dput;
1732
1733    if (path->dentry->d_inode->i_op->follow_link)
1734        return NULL;
1735
1736    path_to_nameidata(path, nd);
1737    error = -EISDIR;
1738    if (S_ISDIR(path->dentry->d_inode->i_mode))
1739        goto exit;
1740ok:
1741    filp = finish_open(nd, open_flag, acc_mode);
1742    return filp;
1743
1744exit_mutex_unlock:
1745    mutex_unlock(&dir->d_inode->i_mutex);
1746exit_dput:
1747    path_put_conditional(path, nd);
1748exit:
1749    if (!IS_ERR(nd->intent.open.file))
1750        release_open_intent(nd);
1751    path_put(&nd->path);
1752    return ERR_PTR(error);
1753}
1754
1755/*
1756 * Note that the low bits of the passed in "open_flag"
1757 * are not the same as in the local variable "flag". See
1758 * open_to_namei_flags() for more details.
1759 */
1760struct file *do_filp_open(int dfd, const char *pathname,
1761        int open_flag, int mode, int acc_mode)
1762{
1763    struct file *filp;
1764    struct nameidata nd;
1765    int error;
1766    struct path path;
1767    int count = 0;
1768    int flag = open_to_namei_flags(open_flag);
1769    int force_reval = 0;
1770
1771    if (!(open_flag & O_CREAT))
1772        mode = 0;
1773
1774    /*
1775     * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
1776     * check for O_DSYNC if the need any syncing at all we enforce it's
1777     * always set instead of having to deal with possibly weird behaviour
1778     * for malicious applications setting only __O_SYNC.
1779     */
1780    if (open_flag & __O_SYNC)
1781        open_flag |= O_DSYNC;
1782
1783    if (!acc_mode)
1784        acc_mode = MAY_OPEN | ACC_MODE(open_flag);
1785
1786    /* O_TRUNC implies we need access checks for write permissions */
1787    if (open_flag & O_TRUNC)
1788        acc_mode |= MAY_WRITE;
1789
1790    /* Allow the LSM permission hook to distinguish append
1791       access from general write access. */
1792    if (open_flag & O_APPEND)
1793        acc_mode |= MAY_APPEND;
1794
1795    /* find the parent */
1796reval:
1797    error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
1798    if (error)
1799        return ERR_PTR(error);
1800    if (force_reval)
1801        nd.flags |= LOOKUP_REVAL;
1802
1803    current->total_link_count = 0;
1804    error = link_path_walk(pathname, &nd);
1805    if (error) {
1806        filp = ERR_PTR(error);
1807        goto out;
1808    }
1809    if (unlikely(!audit_dummy_context()) && (open_flag & O_CREAT))
1810        audit_inode(pathname, nd.path.dentry);
1811
1812    /*
1813     * We have the parent and last component.
1814     */
1815
1816    error = -ENFILE;
1817    filp = get_empty_filp();
1818    if (filp == NULL)
1819        goto exit_parent;
1820    nd.intent.open.file = filp;
1821    filp->f_flags = open_flag;
1822    nd.intent.open.flags = flag;
1823    nd.intent.open.create_mode = mode;
1824    nd.flags &= ~LOOKUP_PARENT;
1825    nd.flags |= LOOKUP_OPEN;
1826    if (open_flag & O_CREAT) {
1827        nd.flags |= LOOKUP_CREATE;
1828        if (open_flag & O_EXCL)
1829            nd.flags |= LOOKUP_EXCL;
1830    }
1831    if (open_flag & O_DIRECTORY)
1832        nd.flags |= LOOKUP_DIRECTORY;
1833    filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1834    while (unlikely(!filp)) { /* trailing symlink */
1835        struct path holder;
1836        struct inode *inode = path.dentry->d_inode;
1837        void *cookie;
1838        error = -ELOOP;
1839        /* S_ISDIR part is a temporary automount kludge */
1840        if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
1841            goto exit_dput;
1842        if (count++ == 32)
1843            goto exit_dput;
1844        /*
1845         * This is subtle. Instead of calling do_follow_link() we do
1846         * the thing by hands. The reason is that this way we have zero
1847         * link_count and path_walk() (called from ->follow_link)
1848         * honoring LOOKUP_PARENT. After that we have the parent and
1849         * last component, i.e. we are in the same situation as after
1850         * the first path_walk(). Well, almost - if the last component
1851         * is normal we get its copy stored in nd->last.name and we will
1852         * have to putname() it when we are done. Procfs-like symlinks
1853         * just set LAST_BIND.
1854         */
1855        nd.flags |= LOOKUP_PARENT;
1856        error = security_inode_follow_link(path.dentry, &nd);
1857        if (error)
1858            goto exit_dput;
1859        error = __do_follow_link(&path, &nd, &cookie);
1860        if (unlikely(error)) {
1861            /* nd.path had been dropped */
1862            if (!IS_ERR(cookie) && inode->i_op->put_link)
1863                inode->i_op->put_link(path.dentry, &nd, cookie);
1864            path_put(&path);
1865            release_open_intent(&nd);
1866            filp = ERR_PTR(error);
1867            goto out;
1868        }
1869        holder = path;
1870        nd.flags &= ~LOOKUP_PARENT;
1871        filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1872        if (inode->i_op->put_link)
1873            inode->i_op->put_link(holder.dentry, &nd, cookie);
1874        path_put(&holder);
1875    }
1876out:
1877    if (nd.root.mnt)
1878        path_put(&nd.root);
1879    if (filp == ERR_PTR(-ESTALE) && !force_reval) {
1880        force_reval = 1;
1881        goto reval;
1882    }
1883    return filp;
1884
1885exit_dput:
1886    path_put_conditional(&path, &nd);
1887    if (!IS_ERR(nd.intent.open.file))
1888        release_open_intent(&nd);
1889exit_parent:
1890    path_put(&nd.path);
1891    filp = ERR_PTR(error);
1892    goto out;
1893}
1894
1895/**
1896 * filp_open - open file and return file pointer
1897 *
1898 * @filename: path to open
1899 * @flags: open flags as per the open(2) second argument
1900 * @mode: mode for the new file if O_CREAT is set, else ignored
1901 *
1902 * This is the helper to open a file from kernelspace if you really
1903 * have to. But in generally you should not do this, so please move
1904 * along, nothing to see here..
1905 */
1906struct file *filp_open(const char *filename, int flags, int mode)
1907{
1908    return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
1909}
1910EXPORT_SYMBOL(filp_open);
1911
1912/**
1913 * lookup_create - lookup a dentry, creating it if it doesn't exist
1914 * @nd: nameidata info
1915 * @is_dir: directory flag
1916 *
1917 * Simple function to lookup and return a dentry and create it
1918 * if it doesn't exist. Is SMP-safe.
1919 *
1920 * Returns with nd->path.dentry->d_inode->i_mutex locked.
1921 */
1922struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1923{
1924    struct dentry *dentry = ERR_PTR(-EEXIST);
1925
1926    mutex_lock_nested(&nd->path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
1927    /*
1928     * Yucky last component or no last component at all?
1929     * (foo/., foo/.., /////)
1930     */
1931    if (nd->last_type != LAST_NORM)
1932        goto fail;
1933    nd->flags &= ~LOOKUP_PARENT;
1934    nd->flags |= LOOKUP_CREATE | LOOKUP_EXCL;
1935    nd->intent.open.flags = O_EXCL;
1936
1937    /*
1938     * Do the final lookup.
1939     */
1940    dentry = lookup_hash(nd);
1941    if (IS_ERR(dentry))
1942        goto fail;
1943
1944    if (dentry->d_inode)
1945        goto eexist;
1946    /*
1947     * Special case - lookup gave negative, but... we had foo/bar/
1948     * From the vfs_mknod() POV we just have a negative dentry -
1949     * all is fine. Let's be bastards - you had / on the end, you've
1950     * been asking for (non-existent) directory. -ENOENT for you.
1951     */
1952    if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
1953        dput(dentry);
1954        dentry = ERR_PTR(-ENOENT);
1955    }
1956    return dentry;
1957eexist:
1958    dput(dentry);
1959    dentry = ERR_PTR(-EEXIST);
1960fail:
1961    return dentry;
1962}
1963EXPORT_SYMBOL_GPL(lookup_create);
1964
1965int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1966{
1967    int error = may_create(dir, dentry);
1968
1969    if (error)
1970        return error;
1971
1972    if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1973        return -EPERM;
1974
1975    if (!dir->i_op->mknod)
1976        return -EPERM;
1977
1978    error = devcgroup_inode_mknod(mode, dev);
1979    if (error)
1980        return error;
1981
1982    error = security_inode_mknod(dir, dentry, mode, dev);
1983    if (error)
1984        return error;
1985
1986    error = dir->i_op->mknod(dir, dentry, mode, dev);
1987    if (!error)
1988        fsnotify_create(dir, dentry);
1989    return error;
1990}
1991
/*
 * Check the file type encoded in @mode for mknod: 0 for a regular file,
 * character or block device, FIFO or socket (a zero type is accepted too),
 * -EPERM for a directory and -EINVAL for anything else.
 */
static int may_mknod(mode_t mode)
{
    const mode_t type = mode & S_IFMT;

    if (type == S_IFDIR)
        return -EPERM;

    switch (type) {
    case 0: /* zero mode translates to S_IFREG */
    case S_IFREG:
    case S_IFCHR:
    case S_IFBLK:
    case S_IFIFO:
    case S_IFSOCK:
        return 0;
    default:
        return -EINVAL;
    }
}
2008
2009SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
2010        unsigned, dev)
2011{
2012    int error;
2013    char *tmp;
2014    struct dentry *dentry;
2015    struct nameidata nd;
2016
2017    if (S_ISDIR(mode))
2018        return -EPERM;
2019
2020    error = user_path_parent(dfd, filename, &nd, &tmp);
2021    if (error)
2022        return error;
2023
2024    dentry = lookup_create(&nd, 0);
2025    if (IS_ERR(dentry)) {
2026        error = PTR_ERR(dentry);
2027        goto out_unlock;
2028    }
2029    if (!IS_POSIXACL(nd.path.dentry->d_inode))
2030        mode &= ~current_umask();
2031    error = may_mknod(mode);
2032    if (error)
2033        goto out_dput;
2034    error = mnt_want_write(nd.path.mnt);
2035    if (error)
2036        goto out_dput;
2037    error = security_path_mknod(&nd.path, dentry, mode, dev);
2038    if (error)
2039        goto out_drop_write;
2040    switch (mode & S_IFMT) {
2041        case 0: case S_IFREG:
2042            error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
2043            break;
2044        case S_IFCHR: case S_IFBLK:
2045            error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,
2046                    new_decode_dev(dev));
2047            break;
2048        case S_IFIFO: case S_IFSOCK:
2049            error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2050            break;
2051    }
2052out_drop_write:
2053    mnt_drop_write(nd.path.mnt);
2054out_dput:
2055    dput(dentry);
2056out_unlock:
2057    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2058    path_put(&nd.path);
2059    putname(tmp);
2060
2061    return error;
2062}
2063
2064SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
2065{
2066    return sys_mknodat(AT_FDCWD, filename, mode, dev);
2067}
2068
2069int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2070{
2071    int error = may_create(dir, dentry);
2072
2073    if (error)
2074        return error;
2075
2076    if (!dir->i_op->mkdir)
2077        return -EPERM;
2078
2079    mode &= (S_IRWXUGO|S_ISVTX);
2080    error = security_inode_mkdir(dir, dentry, mode);
2081    if (error)
2082        return error;
2083
2084    error = dir->i_op->mkdir(dir, dentry, mode);
2085    if (!error)
2086        fsnotify_mkdir(dir, dentry);
2087    return error;
2088}
2089
2090SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
2091{
2092    int error = 0;
2093    char * tmp;
2094    struct dentry *dentry;
2095    struct nameidata nd;
2096
2097    error = user_path_parent(dfd, pathname, &nd, &tmp);
2098    if (error)
2099        goto out_err;
2100
2101    dentry = lookup_create(&nd, 1);
2102    error = PTR_ERR(dentry);
2103    if (IS_ERR(dentry))
2104        goto out_unlock;
2105
2106    if (!IS_POSIXACL(nd.path.dentry->d_inode))
2107        mode &= ~current_umask();
2108    error = mnt_want_write(nd.path.mnt);
2109    if (error)
2110        goto out_dput;
2111    error = security_path_mkdir(&nd.path, dentry, mode);
2112    if (error)
2113        goto out_drop_write;
2114    error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2115out_drop_write:
2116    mnt_drop_write(nd.path.mnt);
2117out_dput:
2118    dput(dentry);
2119out_unlock:
2120    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2121    path_put(&nd.path);
2122    putname(tmp);
2123out_err:
2124    return error;
2125}
2126
2127SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2128{
2129    return sys_mkdirat(AT_FDCWD, pathname, mode);
2130}
2131
2132/*
2133 * We try to drop the dentry early: we should have
2134 * a usage count of 2 if we're the only user of this
2135 * dentry, and if that is true (possibly after pruning
2136 * the dcache), then we drop the dentry now.
2137 *
2138 * A low-level filesystem can, if it choses, legally
2139 * do a
2140 *
2141 * if (!d_unhashed(dentry))
2142 * return -EBUSY;
2143 *
2144 * if it cannot handle the case of removing a directory
2145 * that is still in use by something else..
2146 */
2147void dentry_unhash(struct dentry *dentry)
2148{
2149    dget(dentry);
2150    shrink_dcache_parent(dentry);
2151    spin_lock(&dcache_lock);
2152    spin_lock(&dentry->d_lock);
2153    if (atomic_read(&dentry->d_count) == 2)
2154        __d_drop(dentry);
2155    spin_unlock(&dentry->d_lock);
2156    spin_unlock(&dcache_lock);
2157}
2158
2159int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2160{
2161    int error = may_delete(dir, dentry, 1);
2162
2163    if (error)
2164        return error;
2165
2166    if (!dir->i_op->rmdir)
2167        return -EPERM;
2168
2169    mutex_lock(&dentry->d_inode->i_mutex);
2170    dentry_unhash(dentry);
2171    if (d_mountpoint(dentry))
2172        error = -EBUSY;
2173    else {
2174        error = security_inode_rmdir(dir, dentry);
2175        if (!error) {
2176            error = dir->i_op->rmdir(dir, dentry);
2177            if (!error)
2178                dentry->d_inode->i_flags |= S_DEAD;
2179        }
2180    }
2181    mutex_unlock(&dentry->d_inode->i_mutex);
2182    if (!error) {
2183        d_delete(dentry);
2184    }
2185    dput(dentry);
2186
2187    return error;
2188}
2189
2190static long do_rmdir(int dfd, const char __user *pathname)
2191{
2192    int error = 0;
2193    char * name;
2194    struct dentry *dentry;
2195    struct nameidata nd;
2196
2197    error = user_path_parent(dfd, pathname, &nd, &name);
2198    if (error)
2199        return error;
2200
2201    switch(nd.last_type) {
2202    case LAST_DOTDOT:
2203        error = -ENOTEMPTY;
2204        goto exit1;
2205    case LAST_DOT:
2206        error = -EINVAL;
2207        goto exit1;
2208    case LAST_ROOT:
2209        error = -EBUSY;
2210        goto exit1;
2211    }
2212
2213    nd.flags &= ~LOOKUP_PARENT;
2214
2215    mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2216    dentry = lookup_hash(&nd);
2217    error = PTR_ERR(dentry);
2218    if (IS_ERR(dentry))
2219        goto exit2;
2220    error = mnt_want_write(nd.path.mnt);
2221    if (error)
2222        goto exit3;
2223    error = security_path_rmdir(&nd.path, dentry);
2224    if (error)
2225        goto exit4;
2226    error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2227exit4:
2228    mnt_drop_write(nd.path.mnt);
2229exit3:
2230    dput(dentry);
2231exit2:
2232    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2233exit1:
2234    path_put(&nd.path);
2235    putname(name);
2236    return error;
2237}
2238
2239SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
2240{
2241    return do_rmdir(AT_FDCWD, pathname);
2242}
2243
2244int vfs_unlink(struct inode *dir, struct dentry *dentry)
2245{
2246    int error = may_delete(dir, dentry, 0);
2247
2248    if (error)
2249        return error;
2250
2251    if (!dir->i_op->unlink)
2252        return -EPERM;
2253
2254    mutex_lock(&dentry->d_inode->i_mutex);
2255    if (d_mountpoint(dentry))
2256        error = -EBUSY;
2257    else {
2258        error = security_inode_unlink(dir, dentry);
2259        if (!error) {
2260            error = dir->i_op->unlink(dir, dentry);
2261            if (!error)
2262                dentry->d_inode->i_flags |= S_DEAD;
2263        }
2264    }
2265    mutex_unlock(&dentry->d_inode->i_mutex);
2266
2267    /* We don't d_delete() NFS sillyrenamed files--they still exist. */
2268    if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
2269        fsnotify_link_count(dentry->d_inode);
2270        d_delete(dentry);
2271    }
2272
2273    return error;
2274}
2275
2276/*
2277 * Make sure that the actual truncation of the file will occur outside its
2278 * directory's i_mutex. Truncate can take a long time if there is a lot of
2279 * writeout happening, and we don't want to prevent access to the directory
2280 * while waiting on the I/O.
2281 */
2282static long do_unlinkat(int dfd, const char __user *pathname)
2283{
2284    int error;
2285    char *name;
2286    struct dentry *dentry;
2287    struct nameidata nd;
2288    struct inode *inode = NULL;
2289
2290    error = user_path_parent(dfd, pathname, &nd, &name);
2291    if (error)
2292        return error;
2293
2294    error = -EISDIR;
2295    if (nd.last_type != LAST_NORM)
2296        goto exit1;
2297
2298    nd.flags &= ~LOOKUP_PARENT;
2299
2300    mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2301    dentry = lookup_hash(&nd);
2302    error = PTR_ERR(dentry);
2303    if (!IS_ERR(dentry)) {
2304        /* Why not before? Because we want correct error value */
2305        if (nd.last.name[nd.last.len])
2306            goto slashes;
2307        inode = dentry->d_inode;
2308        if (inode)
2309            atomic_inc(&inode->i_count);
2310        error = mnt_want_write(nd.path.mnt);
2311        if (error)
2312            goto exit2;
2313        error = security_path_unlink(&nd.path, dentry);
2314        if (error)
2315            goto exit3;
2316        error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2317exit3:
2318        mnt_drop_write(nd.path.mnt);
2319    exit2:
2320        dput(dentry);
2321    }
2322    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2323    if (inode)
2324        iput(inode); /* truncate the inode here */
2325exit1:
2326    path_put(&nd.path);
2327    putname(name);
2328    return error;
2329
2330slashes:
2331    error = !dentry->d_inode ? -ENOENT :
2332        S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
2333    goto exit2;
2334}
2335
2336SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
2337{
2338    if ((flag & ~AT_REMOVEDIR) != 0)
2339        return -EINVAL;
2340
2341    if (flag & AT_REMOVEDIR)
2342        return do_rmdir(dfd, pathname);
2343
2344    return do_unlinkat(dfd, pathname);
2345}
2346
2347SYSCALL_DEFINE1(unlink, const char __user *, pathname)
2348{
2349    return do_unlinkat(AT_FDCWD, pathname);
2350}
2351
2352int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2353{
2354    int error = may_create(dir, dentry);
2355
2356    if (error)
2357        return error;
2358
2359    if (!dir->i_op->symlink)
2360        return -EPERM;
2361
2362    error = security_inode_symlink(dir, dentry, oldname);
2363    if (error)
2364        return error;
2365
2366    error = dir->i_op->symlink(dir, dentry, oldname);
2367    if (!error)
2368        fsnotify_create(dir, dentry);
2369    return error;
2370}
2371
2372SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
2373        int, newdfd, const char __user *, newname)
2374{
2375    int error;
2376    char *from;
2377    char *to;
2378    struct dentry *dentry;
2379    struct nameidata nd;
2380
2381    from = getname(oldname);
2382    if (IS_ERR(from))
2383        return PTR_ERR(from);
2384
2385    error = user_path_parent(newdfd, newname, &nd, &to);
2386    if (error)
2387        goto out_putname;
2388
2389    dentry = lookup_create(&nd, 0);
2390    error = PTR_ERR(dentry);
2391    if (IS_ERR(dentry))
2392        goto out_unlock;
2393
2394    error = mnt_want_write(nd.path.mnt);
2395    if (error)
2396        goto out_dput;
2397    error = security_path_symlink(&nd.path, dentry, from);
2398    if (error)
2399        goto out_drop_write;
2400    error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2401out_drop_write:
2402    mnt_drop_write(nd.path.mnt);
2403out_dput:
2404    dput(dentry);
2405out_unlock:
2406    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2407    path_put(&nd.path);
2408    putname(to);
2409out_putname:
2410    putname(from);
2411    return error;
2412}
2413
2414SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
2415{
2416    return sys_symlinkat(oldname, AT_FDCWD, newname);
2417}
2418
2419int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
2420{
2421    struct inode *inode = old_dentry->d_inode;
2422    int error;
2423
2424    if (!inode)
2425        return -ENOENT;
2426
2427    error = may_create(dir, new_dentry);
2428    if (error)
2429        return error;
2430
2431    if (dir->i_sb != inode->i_sb)
2432        return -EXDEV;
2433
2434    /*
2435     * A link to an append-only or immutable file cannot be created.
2436     */
2437    if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2438        return -EPERM;
2439    if (!dir->i_op->link)
2440        return -EPERM;
2441    if (S_ISDIR(inode->i_mode))
2442        return -EPERM;
2443
2444    error = security_inode_link(old_dentry, dir, new_dentry);
2445    if (error)
2446        return error;
2447
2448    mutex_lock(&inode->i_mutex);
2449    error = dir->i_op->link(old_dentry, dir, new_dentry);
2450    mutex_unlock(&inode->i_mutex);
2451    if (!error)
2452        fsnotify_link(dir, inode, new_dentry);
2453    return error;
2454}
2455
2456/*
2457 * Hardlinks are often used in delicate situations. We avoid
2458 * security-related surprises by not following symlinks on the
2459 * newname. --KAB
2460 *
2461 * We don't follow them on the oldname either to be compatible
2462 * with linux 2.0, and to avoid hard-linking to directories
2463 * and other special files. --ADM
2464 */
2465SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
2466        int, newdfd, const char __user *, newname, int, flags)
2467{
2468    struct dentry *new_dentry;
2469    struct nameidata nd;
2470    struct path old_path;
2471    int error;
2472    char *to;
2473
2474    if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
2475        return -EINVAL;
2476
2477    error = user_path_at(olddfd, oldname,
2478                 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
2479                 &old_path);
2480    if (error)
2481        return error;
2482
2483    error = user_path_parent(newdfd, newname, &nd, &to);
2484    if (error)
2485        goto out;
2486    error = -EXDEV;
2487    if (old_path.mnt != nd.path.mnt)
2488        goto out_release;
2489    new_dentry = lookup_create(&nd, 0);
2490    error = PTR_ERR(new_dentry);
2491    if (IS_ERR(new_dentry))
2492        goto out_unlock;
2493    error = mnt_want_write(nd.path.mnt);
2494    if (error)
2495        goto out_dput;
2496    error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2497    if (error)
2498        goto out_drop_write;
2499    error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2500out_drop_write:
2501    mnt_drop_write(nd.path.mnt);
2502out_dput:
2503    dput(new_dentry);
2504out_unlock:
2505    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2506out_release:
2507    path_put(&nd.path);
2508    putname(to);
2509out:
2510    path_put(&old_path);
2511
2512    return error;
2513}
2514
2515SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
2516{
2517    return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
2518}
2519
2520/*
2521 * The worst of all namespace operations - renaming directory. "Perverted"
2522 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2523 * Problems:
2524 * a) we can get into loop creation. Check is done in is_subdir().
2525 * b) race potential - two innocent renames can create a loop together.
2526 * That's where 4.4 screws up. Current fix: serialization on
2527 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
2528 * story.
2529 * c) we have to lock _three_ objects - parents and victim (if it exists).
2530 * And that - after we got ->i_mutex on parents (until then we don't know
2531 * whether the target exists). Solution: try to be smart with locking
2532 * order for inodes. We rely on the fact that tree topology may change
2533 * only under ->s_vfs_rename_mutex _and_ that parent of the object we
2534 * move will be locked. Thus we can rank directories by the tree
2535 * (ancestors first) and rank all non-directories after them.
2536 * That works since everybody except rename does "lock parent, lookup,
2537 * lock child" and rename is under ->s_vfs_rename_mutex.
2538 * HOWEVER, it relies on the assumption that any object with ->lookup()
2539 * has no more than 1 dentry. If "hybrid" objects will ever appear,
2540 * we'd better make sure that there's no link(2) for them.
2541 * d) some filesystems don't support opened-but-unlinked directories,
2542 * either because of layout or because they are not ready to deal with
2543 * all cases correctly. The latter will be fixed (taking this sort of
2544 * stuff into VFS), but the former is not going away. Solution: the same
2545 * trick as in rmdir().
2546 * e) conversion from fhandle to dentry may come in the wrong moment - when
2547 * we are removing the target. Solution: we will have to grab ->i_mutex
2548 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
2549 * ->i_mutex on parents, which works but leads to some truly excessive
2550 * locking].
2551 */
2552static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2553              struct inode *new_dir, struct dentry *new_dentry)
2554{
2555    int error = 0;
2556    struct inode *target;
2557
2558    /*
2559     * If we are going to change the parent - check write permissions,
2560     * we'll need to flip '..'.
2561     */
2562    if (new_dir != old_dir) {
2563        error = inode_permission(old_dentry->d_inode, MAY_WRITE);
2564        if (error)
2565            return error;
2566    }
2567
2568    error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2569    if (error)
2570        return error;
2571
2572    target = new_dentry->d_inode;
2573    if (target) {
2574        mutex_lock(&target->i_mutex);
2575        dentry_unhash(new_dentry);
2576    }
2577    if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2578        error = -EBUSY;
2579    else
2580        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2581    if (target) {
2582        if (!error)
2583            target->i_flags |= S_DEAD;
2584        mutex_unlock(&target->i_mutex);
2585        if (d_unhashed(new_dentry))
2586            d_rehash(new_dentry);
2587        dput(new_dentry);
2588    }
2589    if (!error)
2590        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2591            d_move(old_dentry,new_dentry);
2592    return error;
2593}
2594
2595static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2596                struct inode *new_dir, struct dentry *new_dentry)
2597{
2598    struct inode *target;
2599    int error;
2600
2601    error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
2602    if (error)
2603        return error;
2604
2605    dget(new_dentry);
2606    target = new_dentry->d_inode;
2607    if (target)
2608        mutex_lock(&target->i_mutex);
2609    if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2610        error = -EBUSY;
2611    else
2612        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2613    if (!error) {
2614        if (target)
2615            target->i_flags |= S_DEAD;
2616        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2617            d_move(old_dentry, new_dentry);
2618    }
2619    if (target)
2620        mutex_unlock(&target->i_mutex);
2621    dput(new_dentry);
2622    return error;
2623}
2624
2625int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2626           struct inode *new_dir, struct dentry *new_dentry)
2627{
2628    int error;
2629    int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
2630    const char *old_name;
2631
2632    if (old_dentry->d_inode == new_dentry->d_inode)
2633         return 0;
2634 
2635    error = may_delete(old_dir, old_dentry, is_dir);
2636    if (error)
2637        return error;
2638
2639    if (!new_dentry->d_inode)
2640        error = may_create(new_dir, new_dentry);
2641    else
2642        error = may_delete(new_dir, new_dentry, is_dir);
2643    if (error)
2644        return error;
2645
2646    if (!old_dir->i_op->rename)
2647        return -EPERM;
2648
2649    old_name = fsnotify_oldname_init(old_dentry->d_name.name);
2650
2651    if (is_dir)
2652        error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
2653    else
2654        error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
2655    if (!error)
2656        fsnotify_move(old_dir, new_dir, old_name, is_dir,
2657                  new_dentry->d_inode, old_dentry);
2658    fsnotify_oldname_free(old_name);
2659
2660    return error;
2661}
2662
2663SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
2664        int, newdfd, const char __user *, newname)
2665{
2666    struct dentry *old_dir, *new_dir;
2667    struct dentry *old_dentry, *new_dentry;
2668    struct dentry *trap;
2669    struct nameidata oldnd, newnd;
2670    char *from;
2671    char *to;
2672    int error;
2673
2674    error = user_path_parent(olddfd, oldname, &oldnd, &from);
2675    if (error)
2676        goto exit;
2677
2678    error = user_path_parent(newdfd, newname, &newnd, &to);
2679    if (error)
2680        goto exit1;
2681
2682    error = -EXDEV;
2683    if (oldnd.path.mnt != newnd.path.mnt)
2684        goto exit2;
2685
2686    old_dir = oldnd.path.dentry;
2687    error = -EBUSY;
2688    if (oldnd.last_type != LAST_NORM)
2689        goto exit2;
2690
2691    new_dir = newnd.path.dentry;
2692    if (newnd.last_type != LAST_NORM)
2693        goto exit2;
2694
2695    oldnd.flags &= ~LOOKUP_PARENT;
2696    newnd.flags &= ~LOOKUP_PARENT;
2697    newnd.flags |= LOOKUP_RENAME_TARGET;
2698
2699    trap = lock_rename(new_dir, old_dir);
2700
2701    old_dentry = lookup_hash(&oldnd);
2702    error = PTR_ERR(old_dentry);
2703    if (IS_ERR(old_dentry))
2704        goto exit3;
2705    /* source must exist */
2706    error = -ENOENT;
2707    if (!old_dentry->d_inode)
2708        goto exit4;
2709    /* unless the source is a directory trailing slashes give -ENOTDIR */
2710    if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
2711        error = -ENOTDIR;
2712        if (oldnd.last.name[oldnd.last.len])
2713            goto exit4;
2714        if (newnd.last.name[newnd.last.len])
2715            goto exit4;
2716    }
2717    /* source should not be ancestor of target */
2718    error = -EINVAL;
2719    if (old_dentry == trap)
2720        goto exit4;
2721    new_dentry = lookup_hash(&newnd);
2722    error = PTR_ERR(new_dentry);
2723    if (IS_ERR(new_dentry))
2724        goto exit4;
2725    /* target should not be an ancestor of source */
2726    error = -ENOTEMPTY;
2727    if (new_dentry == trap)
2728        goto exit5;
2729
2730    error = mnt_want_write(oldnd.path.mnt);
2731    if (error)
2732        goto exit5;
2733    error = security_path_rename(&oldnd.path, old_dentry,
2734                     &newnd.path, new_dentry);
2735    if (error)
2736        goto exit6;
2737    error = vfs_rename(old_dir->d_inode, old_dentry,
2738                   new_dir->d_inode, new_dentry);
2739exit6:
2740    mnt_drop_write(oldnd.path.mnt);
2741exit5:
2742    dput(new_dentry);
2743exit4:
2744    dput(old_dentry);
2745exit3:
2746    unlock_rename(new_dir, old_dir);
2747exit2:
2748    path_put(&newnd.path);
2749    putname(to);
2750exit1:
2751    path_put(&oldnd.path);
2752    putname(from);
2753exit:
2754    return error;
2755}
2756
2757SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
2758{
2759    return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
2760}
2761
2762int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
2763{
2764    int len;
2765
2766    len = PTR_ERR(link);
2767    if (IS_ERR(link))
2768        goto out;
2769
2770    len = strlen(link);
2771    if (len > (unsigned) buflen)
2772        len = buflen;
2773    if (copy_to_user(buffer, link, len))
2774        len = -EFAULT;
2775out:
2776    return len;
2777}
2778
2779/*
2780 * A helper for ->readlink(). This should be used *ONLY* for symlinks that
2781 * have ->follow_link() touching nd only in nd_set_link(). Using (or not
2782 * using) it for any given inode is up to filesystem.
2783 */
2784int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2785{
2786    struct nameidata nd;
2787    void *cookie;
2788    int res;
2789
2790    nd.depth = 0;
2791    cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
2792    if (IS_ERR(cookie))
2793        return PTR_ERR(cookie);
2794
2795    res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
2796    if (dentry->d_inode->i_op->put_link)
2797        dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
2798    return res;
2799}
2800
/* Exported wrapper that forwards to __vfs_follow_link(). */
int vfs_follow_link(struct nameidata *nd, const char *link)
{
    int ret = __vfs_follow_link(nd, link);

    return ret;
}
2805
2806/* get the link contents into pagecache */
2807static char *page_getlink(struct dentry * dentry, struct page **ppage)
2808{
2809    char *kaddr;
2810    struct page *page;
2811    struct address_space *mapping = dentry->d_inode->i_mapping;
2812    page = read_mapping_page(mapping, 0, NULL);
2813    if (IS_ERR(page))
2814        return (char*)page;
2815    *ppage = page;
2816    kaddr = kmap(page);
2817    nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2818    return kaddr;
2819}
2820
2821int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
2822{
2823    struct page *page = NULL;
2824    char *s = page_getlink(dentry, &page);
2825    int res = vfs_readlink(dentry,buffer,buflen,s);
2826    if (page) {
2827        kunmap(page);
2828        page_cache_release(page);
2829    }
2830    return res;
2831}
2832
2833void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
2834{
2835    struct page *page = NULL;
2836    nd_set_link(nd, page_getlink(dentry, &page));
2837    return page;
2838}
2839
2840void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2841{
2842    struct page *page = cookie;
2843
2844    if (page) {
2845        kunmap(page);
2846        page_cache_release(page);
2847    }
2848}
2849
2850/*
2851 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2852 */
2853int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2854{
2855    struct address_space *mapping = inode->i_mapping;
2856    struct page *page;
2857    void *fsdata;
2858    int err;
2859    char *kaddr;
2860    unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2861    if (nofs)
2862        flags |= AOP_FLAG_NOFS;
2863
2864retry:
2865    err = pagecache_write_begin(NULL, mapping, 0, len-1,
2866                flags, &page, &fsdata);
2867    if (err)
2868        goto fail;
2869
2870    kaddr = kmap_atomic(page, KM_USER0);
2871    memcpy(kaddr, symname, len-1);
2872    kunmap_atomic(kaddr, KM_USER0);
2873
2874    err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
2875                            page, fsdata);
2876    if (err < 0)
2877        goto fail;
2878    if (err < len-1)
2879        goto retry;
2880
2881    mark_inode_dirty(inode);
2882    return 0;
2883fail:
2884    return err;
2885}
2886
2887int page_symlink(struct inode *inode, const char *symname, int len)
2888{
2889    return __page_symlink(inode, symname, len,
2890            !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2891}
2892
2893const struct inode_operations page_symlink_inode_operations = {
2894    .readlink = generic_readlink,
2895    .follow_link = page_follow_link_light,
2896    .put_link = page_put_link,
2897};
2898
/* Symbols exported from this file, in alphabetical order. */
EXPORT_SYMBOL(__page_symlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(generic_readlink);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(lock_rename);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link_light);
EXPORT_SYMBOL(page_put_link);
EXPORT_SYMBOL(page_readlink);
EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(vfs_create);
EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
2931

Archive Download this file



interactive