Root/target/linux/generic/patches-3.0/100-overlayfs_v10.patch

1--- a/fs/open.c
2+++ b/fs/open.c
3@@ -666,8 +666,7 @@ static inline int __get_file_write_acces
4     return error;
5 }
6 
7-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
8- struct file *f,
9+static struct file *__dentry_open(struct path *path, struct file *f,
10                     int (*open)(struct inode *, struct file *),
11                     const struct cred *cred)
12 {
13@@ -675,15 +674,16 @@ static struct file *__dentry_open(struct
14     struct inode *inode;
15     int error;
16 
17+ path_get(path);
18     f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
19                 FMODE_PREAD | FMODE_PWRITE;
20 
21     if (unlikely(f->f_flags & O_PATH))
22         f->f_mode = FMODE_PATH;
23 
24- inode = dentry->d_inode;
25+ inode = path->dentry->d_inode;
26     if (f->f_mode & FMODE_WRITE) {
27- error = __get_file_write_access(inode, mnt);
28+ error = __get_file_write_access(inode, path->mnt);
29         if (error)
30             goto cleanup_file;
31         if (!special_file(inode->i_mode))
32@@ -691,8 +691,7 @@ static struct file *__dentry_open(struct
33     }
34 
35     f->f_mapping = inode->i_mapping;
36- f->f_path.dentry = dentry;
37- f->f_path.mnt = mnt;
38+ f->f_path = *path;
39     f->f_pos = 0;
40     file_sb_list_add(f, inode->i_sb);
41 
42@@ -745,7 +744,7 @@ cleanup_all:
43              * here, so just reset the state.
44              */
45             file_reset_write(f);
46- mnt_drop_write(mnt);
47+ mnt_drop_write(path->mnt);
48         }
49     }
50     file_sb_list_del(f);
51@@ -753,8 +752,7 @@ cleanup_all:
52     f->f_path.mnt = NULL;
53 cleanup_file:
54     put_filp(f);
55- dput(dentry);
56- mntput(mnt);
57+ path_put(path);
58     return ERR_PTR(error);
59 }
60 
61@@ -780,14 +778,14 @@ cleanup_file:
62 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
63         int (*open)(struct inode *, struct file *))
64 {
65+ struct path path = { .dentry = dentry, .mnt = nd->path.mnt };
66     const struct cred *cred = current_cred();
67 
68     if (IS_ERR(nd->intent.open.file))
69         goto out;
70     if (IS_ERR(dentry))
71         goto out_err;
72- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
73- nd->intent.open.file,
74+ nd->intent.open.file = __dentry_open(&path, nd->intent.open.file,
75                          open, cred);
76 out:
77     return nd->intent.open.file;
78@@ -816,10 +814,17 @@ struct file *nameidata_to_filp(struct na
79 
80     /* Has the filesystem initialised the file for us? */
81     if (filp->f_path.dentry == NULL) {
82- path_get(&nd->path);
83- filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
84- NULL, cred);
85+ struct inode *inode = nd->path.dentry->d_inode;
86+
87+ if (inode->i_op->open) {
88+ int flags = filp->f_flags;
89+ put_filp(filp);
90+ filp = inode->i_op->open(nd->path.dentry, flags, cred);
91+ } else {
92+ filp = __dentry_open(&nd->path, filp, NULL, cred);
93+ }
94     }
95+
96     return filp;
97 }
98 
99@@ -830,26 +835,45 @@ struct file *nameidata_to_filp(struct na
100 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
101              const struct cred *cred)
102 {
103- int error;
104- struct file *f;
105-
106- validate_creds(cred);
107+ struct path path = { .dentry = dentry, .mnt = mnt };
108+ struct file *ret;
109 
110     /* We must always pass in a valid mount pointer. */
111     BUG_ON(!mnt);
112 
113- error = -ENFILE;
114+ ret = vfs_open(&path, flags, cred);
115+ path_put(&path);
116+
117+ return ret;
118+}
119+EXPORT_SYMBOL(dentry_open);
120+
121+/**
122+ * vfs_open - open the file at the given path
123+ * @path: path to open
124+ * @flags: open flags
125+ * @cred: credentials to use
126+ *
127+ * Open the file. If successful, the returned file will have acquired
128+ * an additional reference for path.
129+ */
130+struct file *vfs_open(struct path *path, int flags, const struct cred *cred)
131+{
132+ struct file *f;
133+ struct inode *inode = path->dentry->d_inode;
134+
135+ validate_creds(cred);
136+
137+ if (inode->i_op->open)
138+ return inode->i_op->open(path->dentry, flags, cred);
139     f = get_empty_filp();
140- if (f == NULL) {
141- dput(dentry);
142- mntput(mnt);
143- return ERR_PTR(error);
144- }
145+ if (f == NULL)
146+ return ERR_PTR(-ENFILE);
147 
148     f->f_flags = flags;
149- return __dentry_open(dentry, mnt, f, NULL, cred);
150+ return __dentry_open(path, f, NULL, cred);
151 }
152-EXPORT_SYMBOL(dentry_open);
153+EXPORT_SYMBOL(vfs_open);
154 
155 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
156 {
157--- a/include/linux/fs.h
158+++ b/include/linux/fs.h
159@@ -1603,6 +1603,7 @@ struct inode_operations {
160     void (*truncate_range)(struct inode *, loff_t, loff_t);
161     int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
162               u64 len);
163+ struct file *(*open)(struct dentry *, int flags, const struct cred *);
164 } ____cacheline_aligned;
165 
166 struct seq_file;
167@@ -1997,6 +1998,7 @@ extern long do_sys_open(int dfd, const c
168 extern struct file *filp_open(const char *, int, int);
169 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
170                    const char *, int);
171+extern struct file *vfs_open(struct path *, int flags, const struct cred *);
172 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
173                  const struct cred *);
174 extern int filp_close(struct file *, fl_owner_t id);
175--- a/fs/splice.c
176+++ b/fs/splice.c
177@@ -1300,6 +1300,7 @@ long do_splice_direct(struct file *in, l
178 
179     return ret;
180 }
181+EXPORT_SYMBOL(do_splice_direct);
182 
183 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
184                    struct pipe_inode_info *opipe,
185--- a/fs/namespace.c
186+++ b/fs/namespace.c
187@@ -1494,6 +1494,23 @@ void drop_collected_mounts(struct vfsmou
188     release_mounts(&umount_list);
189 }
190 
191+struct vfsmount *clone_private_mount(struct path *path)
192+{
193+ struct vfsmount *mnt;
194+
195+ if (IS_MNT_UNBINDABLE(path->mnt))
196+ return ERR_PTR(-EINVAL);
197+
198+ down_read(&namespace_sem);
199+ mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE);
200+ up_read(&namespace_sem);
201+ if (!mnt)
202+ return ERR_PTR(-ENOMEM);
203+
204+ return mnt;
205+}
206+EXPORT_SYMBOL_GPL(clone_private_mount);
207+
208 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
209            struct vfsmount *root)
210 {
211--- a/include/linux/mount.h
212+++ b/include/linux/mount.h
213@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt
214 extern void mnt_unpin(struct vfsmount *mnt);
215 extern int __mnt_is_readonly(struct vfsmount *mnt);
216 
217+struct path;
218+extern struct vfsmount *clone_private_mount(struct path *path);
219+
220 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
221                       const char *name, void *data);
222 
223--- a/fs/Kconfig
224+++ b/fs/Kconfig
225@@ -63,6 +63,7 @@ source "fs/quota/Kconfig"
226 
227 source "fs/autofs4/Kconfig"
228 source "fs/fuse/Kconfig"
229+source "fs/overlayfs/Kconfig"
230 
231 config CUSE
232     tristate "Character device in Userspace support"
233--- a/fs/Makefile
234+++ b/fs/Makefile
235@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4/
236 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
237 obj-$(CONFIG_ADFS_FS) += adfs/
238 obj-$(CONFIG_FUSE_FS) += fuse/
239+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
240 obj-$(CONFIG_UDF_FS) += udf/
241 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
242 obj-$(CONFIG_OMFS_FS) += omfs/
243--- /dev/null
244+++ b/fs/overlayfs/Kconfig
245@@ -0,0 +1,4 @@
246+config OVERLAYFS_FS
247+ tristate "Overlay filesystem support"
248+ help
249+ Add support for overlay filesystem.
250--- /dev/null
251+++ b/fs/overlayfs/Makefile
252@@ -0,0 +1,7 @@
253+#
254+# Makefile for the overlay filesystem.
255+#
256+
257+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
258+
259+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
260--- /dev/null
261+++ b/fs/overlayfs/copy_up.c
262@@ -0,0 +1,383 @@
263+/*
264+ *
265+ * Copyright (C) 2011 Novell Inc.
266+ *
267+ * This program is free software; you can redistribute it and/or modify it
268+ * under the terms of the GNU General Public License version 2 as published by
269+ * the Free Software Foundation.
270+ */
271+
272+#include <linux/fs.h>
273+#include <linux/slab.h>
274+#include <linux/file.h>
275+#include <linux/splice.h>
276+#include <linux/xattr.h>
277+#include <linux/security.h>
278+#include <linux/uaccess.h>
279+#include "overlayfs.h"
280+
281+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
282+
283+static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)
284+{
285+ ssize_t list_size, size;
286+ char *buf, *name, *value;
287+ int error;
288+
289+ if (!old->d_inode->i_op->getxattr ||
290+ !new->d_inode->i_op->getxattr)
291+ return 0;
292+
293+ list_size = vfs_listxattr(old, NULL, 0);
294+ if (list_size <= 0) {
295+ if (list_size == -EOPNOTSUPP)
296+ return 0;
297+ return list_size;
298+ }
299+
300+ buf = kzalloc(list_size, GFP_KERNEL);
301+ if (!buf)
302+ return -ENOMEM;
303+
304+ error = -ENOMEM;
305+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
306+ if (!value)
307+ goto out;
308+
309+ list_size = vfs_listxattr(old, buf, list_size);
310+ if (list_size <= 0) {
311+ error = list_size;
312+ goto out_free_value;
313+ }
314+
315+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
316+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
317+ if (size <= 0) {
318+ error = size;
319+ goto out_free_value;
320+ }
321+ error = vfs_setxattr(new, name, value, size, 0);
322+ if (error)
323+ goto out_free_value;
324+ }
325+
326+out_free_value:
327+ kfree(value);
328+out:
329+ kfree(buf);
330+ return error;
331+}
332+
333+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
334+{
335+ struct file *old_file;
336+ struct file *new_file;
337+ int error = 0;
338+
339+ if (len == 0)
340+ return 0;
341+
342+ old_file = vfs_open(old, O_RDONLY, current_cred());
343+ if (IS_ERR(old_file))
344+ return PTR_ERR(old_file);
345+
346+ new_file = vfs_open(new, O_WRONLY, current_cred());
347+ if (IS_ERR(new_file)) {
348+ error = PTR_ERR(new_file);
349+ goto out_fput;
350+ }
351+
352+ /* FIXME: copy up sparse files efficiently */
353+ while (len) {
354+ loff_t offset = new_file->f_pos;
355+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
356+ long bytes;
357+
358+ if (len < this_len)
359+ this_len = len;
360+
361+ if (signal_pending_state(TASK_KILLABLE, current)) {
362+ error = -EINTR;
363+ break;
364+ }
365+
366+ bytes = do_splice_direct(old_file, &offset, new_file, this_len,
367+ SPLICE_F_MOVE);
368+ if (bytes <= 0) {
369+ error = bytes;
370+ break;
371+ }
372+
373+ len -= bytes;
374+ }
375+
376+ fput(new_file);
377+out_fput:
378+ fput(old_file);
379+ return error;
380+}
381+
382+static char *ovl_read_symlink(struct dentry *realdentry)
383+{
384+ int res;
385+ char *buf;
386+ struct inode *inode = realdentry->d_inode;
387+ mm_segment_t old_fs;
388+
389+ res = -EINVAL;
390+ if (!inode->i_op->readlink)
391+ goto err;
392+
393+ res = -ENOMEM;
394+ buf = (char *) __get_free_page(GFP_KERNEL);
395+ if (!buf)
396+ goto err;
397+
398+ old_fs = get_fs();
399+ set_fs(get_ds());
400+ /* The cast to a user pointer is valid due to the set_fs() */
401+ res = inode->i_op->readlink(realdentry,
402+ (char __user *)buf, PAGE_SIZE - 1);
403+ set_fs(old_fs);
404+ if (res < 0) {
405+ free_page((unsigned long) buf);
406+ goto err;
407+ }
408+ buf[res] = '\0';
409+
410+ return buf;
411+
412+err:
413+ return ERR_PTR(res);
414+}
415+
416+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
417+{
418+ struct iattr attr = {
419+ .ia_valid = ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
420+ .ia_atime = stat->atime,
421+ .ia_mtime = stat->mtime,
422+ };
423+
424+ return notify_change(upperdentry, &attr);
425+}
426+
427+static int ovl_set_mode(struct dentry *upperdentry, umode_t mode)
428+{
429+ struct iattr attr = {
430+ .ia_valid = ATTR_MODE,
431+ .ia_mode = mode,
432+ };
433+
434+ return notify_change(upperdentry, &attr);
435+}
436+
437+static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,
438+ struct path *lowerpath, struct kstat *stat,
439+ const char *link)
440+{
441+ int err;
442+ struct path newpath;
443+ umode_t mode = stat->mode;
444+
445+ /* Can't properly set mode on creation because of the umask */
446+ stat->mode &= S_IFMT;
447+
448+ ovl_path_upper(dentry, &newpath);
449+ WARN_ON(newpath.dentry);
450+ newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link);
451+ if (IS_ERR(newpath.dentry))
452+ return PTR_ERR(newpath.dentry);
453+
454+ if (S_ISREG(stat->mode)) {
455+ err = ovl_copy_up_data(lowerpath, &newpath, stat->size);
456+ if (err)
457+ goto err_remove;
458+ }
459+
460+ err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry);
461+ if (err)
462+ goto err_remove;
463+
464+ mutex_lock(&newpath.dentry->d_inode->i_mutex);
465+ if (!S_ISLNK(stat->mode))
466+ err = ovl_set_mode(newpath.dentry, mode);
467+ if (!err)
468+ err = ovl_set_timestamps(newpath.dentry, stat);
469+ mutex_unlock(&newpath.dentry->d_inode->i_mutex);
470+ if (err)
471+ goto err_remove;
472+
473+ ovl_dentry_update(dentry, newpath.dentry);
474+
475+ /*
476+ * Easiest way to get rid of the lower dentry reference is to
477+ * drop this dentry. This is neither needed nor possible for
478+ * directories.
479+ */
480+ if (!S_ISDIR(stat->mode))
481+ d_drop(dentry);
482+
483+ return 0;
484+
485+err_remove:
486+ if (S_ISDIR(stat->mode))
487+ vfs_rmdir(upperdir->d_inode, newpath.dentry);
488+ else
489+ vfs_unlink(upperdir->d_inode, newpath.dentry);
490+
491+ dput(newpath.dentry);
492+
493+ return err;
494+}
495+
496+/*
497+ * Copy up a single dentry
498+ *
499+ * Directory renames only allowed on "pure upper" (already created on
500+ * upper filesystem, never copied up). Directories which are on lower or
501+ * are merged may not be renamed. For these -EXDEV is returned and
502+ * userspace has to deal with it. This means, when copying up a
503+ * directory we can rely on it and ancestors being stable.
504+ *
505+ * Non-directory renames start with copy up of source if necessary. The
506+ * actual rename will only proceed once the copy up was successful. Copy
507+ * up uses upper parent i_mutex for exclusion. Since rename can change
508+ * d_parent it is possible that the copy up will lock the old parent. At
509+ * that point the file will have already been copied up anyway.
510+ */
511+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
512+ struct path *lowerpath, struct kstat *stat)
513+{
514+ int err;
515+ struct kstat pstat;
516+ struct path parentpath;
517+ struct dentry *upperdir;
518+ const struct cred *old_cred;
519+ struct cred *override_cred;
520+ char *link = NULL;
521+
522+ ovl_path_upper(parent, &parentpath);
523+ upperdir = parentpath.dentry;
524+
525+ err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat);
526+ if (err)
527+ return err;
528+
529+ if (S_ISLNK(stat->mode)) {
530+ link = ovl_read_symlink(lowerpath->dentry);
531+ if (IS_ERR(link))
532+ return PTR_ERR(link);
533+ }
534+
535+ err = -ENOMEM;
536+ override_cred = prepare_creds();
537+ if (!override_cred)
538+ goto out_free_link;
539+
540+ override_cred->fsuid = stat->uid;
541+ override_cred->fsgid = stat->gid;
542+ /*
543+ * CAP_SYS_ADMIN for copying up extended attributes
544+ * CAP_DAC_OVERRIDE for create
545+ * CAP_FOWNER for chmod, timestamp update
546+ * CAP_FSETID for chmod
547+ * CAP_MKNOD for mknod
548+ */
549+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
550+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
551+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
552+ cap_raise(override_cred->cap_effective, CAP_FSETID);
553+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
554+ old_cred = override_creds(override_cred);
555+
556+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
557+ if (ovl_path_type(dentry) != OVL_PATH_LOWER) {
558+ err = 0;
559+ } else {
560+ err = ovl_copy_up_locked(upperdir, dentry, lowerpath,
561+ stat, link);
562+ if (!err) {
563+ /* Restore timestamps on parent (best effort) */
564+ ovl_set_timestamps(upperdir, &pstat);
565+ }
566+ }
567+
568+ mutex_unlock(&upperdir->d_inode->i_mutex);
569+
570+ revert_creds(old_cred);
571+ put_cred(override_cred);
572+
573+out_free_link:
574+ if (link)
575+ free_page((unsigned long) link);
576+
577+ return err;
578+}
579+
580+int ovl_copy_up(struct dentry *dentry)
581+{
582+ int err;
583+
584+ err = 0;
585+ while (!err) {
586+ struct dentry *next;
587+ struct dentry *parent;
588+ struct path lowerpath;
589+ struct kstat stat;
590+ enum ovl_path_type type = ovl_path_type(dentry);
591+
592+ if (type != OVL_PATH_LOWER)
593+ break;
594+
595+ next = dget(dentry);
596+ /* find the topmost dentry not yet copied up */
597+ for (;;) {
598+ parent = dget_parent(next);
599+
600+ type = ovl_path_type(parent);
601+ if (type != OVL_PATH_LOWER)
602+ break;
603+
604+ dput(next);
605+ next = parent;
606+ }
607+
608+ ovl_path_lower(next, &lowerpath);
609+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
610+ if (!err)
611+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
612+
613+ dput(parent);
614+ dput(next);
615+ }
616+
617+ return err;
618+}
619+
620+/* Optimize by not copying up the file first and truncating later */
621+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)
622+{
623+ int err;
624+ struct kstat stat;
625+ struct path lowerpath;
626+ struct dentry *parent = dget_parent(dentry);
627+
628+ err = ovl_copy_up(parent);
629+ if (err)
630+ goto out_dput_parent;
631+
632+ ovl_path_lower(dentry, &lowerpath);
633+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
634+ if (err)
635+ goto out_dput_parent;
636+
637+ if (size < stat.size)
638+ stat.size = size;
639+
640+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
641+
642+out_dput_parent:
643+ dput(parent);
644+ return err;
645+}
646--- /dev/null
647+++ b/fs/overlayfs/dir.c
648@@ -0,0 +1,607 @@
649+/*
650+ *
651+ * Copyright (C) 2011 Novell Inc.
652+ *
653+ * This program is free software; you can redistribute it and/or modify it
654+ * under the terms of the GNU General Public License version 2 as published by
655+ * the Free Software Foundation.
656+ */
657+
658+#include <linux/fs.h>
659+#include <linux/namei.h>
660+#include <linux/xattr.h>
661+#include <linux/security.h>
662+#include "overlayfs.h"
663+
664+static const char *ovl_whiteout_symlink = "(overlay-whiteout)";
665+
666+static struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
667+ struct nameidata *nd)
668+{
669+ int err = ovl_do_lookup(dentry);
670+
671+ if (err)
672+ return ERR_PTR(err);
673+
674+ return NULL;
675+}
676+
677+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)
678+{
679+ int err;
680+ struct dentry *newdentry;
681+ const struct cred *old_cred;
682+ struct cred *override_cred;
683+
684+ /* FIXME: recheck lower dentry to see if whiteout is really needed */
685+
686+ err = -ENOMEM;
687+ override_cred = prepare_creds();
688+ if (!override_cred)
689+ goto out;
690+
691+ /*
692+ * CAP_SYS_ADMIN for setxattr
693+ * CAP_DAC_OVERRIDE for symlink creation
694+ * CAP_FOWNER for unlink in sticky directory
695+ */
696+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
697+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
698+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
699+ override_cred->fsuid = 0;
700+ override_cred->fsgid = 0;
701+ old_cred = override_creds(override_cred);
702+
703+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
704+ dentry->d_name.len);
705+ err = PTR_ERR(newdentry);
706+ if (IS_ERR(newdentry))
707+ goto out_put_cred;
708+
709+ /* Just been removed within the same locked region */
710+ WARN_ON(newdentry->d_inode);
711+
712+ err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink);
713+ if (err)
714+ goto out_dput;
715+
716+ ovl_dentry_version_inc(dentry->d_parent);
717+
718+ err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0);
719+ if (err)
720+ vfs_unlink(upperdir->d_inode, newdentry);
721+
722+out_dput:
723+ dput(newdentry);
724+out_put_cred:
725+ revert_creds(old_cred);
726+ put_cred(override_cred);
727+out:
728+ if (err) {
729+ /*
730+ * There's no way to recover from failure to whiteout.
731+ * What should we do? Log a big fat error and... ?
732+ */
733+ printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n",
734+ dentry->d_name.name);
735+ }
736+
737+ return err;
738+}
739+
740+static struct dentry *ovl_lookup_create(struct dentry *upperdir,
741+ struct dentry *template)
742+{
743+ int err;
744+ struct dentry *newdentry;
745+ struct qstr *name = &template->d_name;
746+
747+ newdentry = lookup_one_len(name->name, upperdir, name->len);
748+ if (IS_ERR(newdentry))
749+ return newdentry;
750+
751+ if (newdentry->d_inode) {
752+ const struct cred *old_cred;
753+ struct cred *override_cred;
754+
755+ /* No need to check whiteout if lower parent is non-existent */
756+ err = -EEXIST;
757+ if (!ovl_dentry_lower(template->d_parent))
758+ goto out_dput;
759+
760+ if (!S_ISLNK(newdentry->d_inode->i_mode))
761+ goto out_dput;
762+
763+ err = -ENOMEM;
764+ override_cred = prepare_creds();
765+ if (!override_cred)
766+ goto out_dput;
767+
768+ /*
769+ * CAP_SYS_ADMIN for getxattr
770+ * CAP_FOWNER for unlink in sticky directory
771+ */
772+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
773+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
774+ old_cred = override_creds(override_cred);
775+
776+ err = -EEXIST;
777+ if (ovl_is_whiteout(newdentry))
778+ err = vfs_unlink(upperdir->d_inode, newdentry);
779+
780+ revert_creds(old_cred);
781+ put_cred(override_cred);
782+ if (err)
783+ goto out_dput;
784+
785+ dput(newdentry);
786+ newdentry = lookup_one_len(name->name, upperdir, name->len);
787+ if (IS_ERR(newdentry)) {
788+ ovl_whiteout(upperdir, template);
789+ return newdentry;
790+ }
791+
792+ /*
793+ * Whiteout just been successfully removed, parent
794+ * i_mutex is still held, there's no way the lookup
795+ * could return positive.
796+ */
797+ WARN_ON(newdentry->d_inode);
798+ }
799+
800+ return newdentry;
801+
802+out_dput:
803+ dput(newdentry);
804+ return ERR_PTR(err);
805+}
806+
807+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
808+ struct kstat *stat, const char *link)
809+{
810+ int err;
811+ struct dentry *newdentry;
812+ struct inode *dir = upperdir->d_inode;
813+
814+ newdentry = ovl_lookup_create(upperdir, dentry);
815+ if (IS_ERR(newdentry))
816+ goto out;
817+
818+ switch (stat->mode & S_IFMT) {
819+ case S_IFREG:
820+ err = vfs_create(dir, newdentry, stat->mode, NULL);
821+ break;
822+
823+ case S_IFDIR:
824+ err = vfs_mkdir(dir, newdentry, stat->mode);
825+ break;
826+
827+ case S_IFCHR:
828+ case S_IFBLK:
829+ case S_IFIFO:
830+ case S_IFSOCK:
831+ err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev);
832+ break;
833+
834+ case S_IFLNK:
835+ err = vfs_symlink(dir, newdentry, link);
836+ break;
837+
838+ default:
839+ err = -EPERM;
840+ }
841+ if (err) {
842+ if (ovl_dentry_is_opaque(dentry))
843+ ovl_whiteout(upperdir, dentry);
844+ dput(newdentry);
845+ newdentry = ERR_PTR(err);
846+ } else if (WARN_ON(!newdentry->d_inode)) {
847+ /*
848+ * Not quite sure if non-instantiated dentry is legal or not.
849+ * VFS doesn't seem to care so check and warn here.
850+ */
851+ dput(newdentry);
852+ newdentry = ERR_PTR(-ENOENT);
853+ }
854+
855+out:
856+ return newdentry;
857+
858+}
859+
860+static int ovl_set_opaque(struct dentry *upperdentry)
861+{
862+ int err;
863+ const struct cred *old_cred;
864+ struct cred *override_cred;
865+
866+ override_cred = prepare_creds();
867+ if (!override_cred)
868+ return -ENOMEM;
869+
870+ /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */
871+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
872+ old_cred = override_creds(override_cred);
873+ err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
874+ revert_creds(old_cred);
875+ put_cred(override_cred);
876+
877+ return err;
878+}
879+
880+static int ovl_remove_opaque(struct dentry *upperdentry)
881+{
882+ int err;
883+ const struct cred *old_cred;
884+ struct cred *override_cred;
885+
886+ override_cred = prepare_creds();
887+ if (!override_cred)
888+ return -ENOMEM;
889+
890+ /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */
891+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
892+ old_cred = override_creds(override_cred);
893+ err = vfs_removexattr(upperdentry, ovl_opaque_xattr);
894+ revert_creds(old_cred);
895+ put_cred(override_cred);
896+
897+ return err;
898+}
899+
900+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
901+ struct kstat *stat)
902+{
903+ int err;
904+ enum ovl_path_type type;
905+ struct path realpath;
906+
907+ type = ovl_path_real(dentry, &realpath);
908+ err = vfs_getattr(realpath.mnt, realpath.dentry, stat);
909+ if (err)
910+ return err;
911+
912+ stat->dev = dentry->d_sb->s_dev;
913+ stat->ino = dentry->d_inode->i_ino;
914+
915+ /*
916+ * It's probably not worth it to count subdirs to get the
917+ * correct link count. nlink=1 seems to pacify 'find' and
918+ * other utilities.
919+ */
920+ if (type == OVL_PATH_MERGE)
921+ stat->nlink = 1;
922+
923+ return 0;
924+}
925+
926+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
927+ const char *link)
928+{
929+ int err;
930+ struct dentry *newdentry;
931+ struct dentry *upperdir;
932+ struct inode *inode;
933+ struct kstat stat = {
934+ .mode = mode,
935+ .rdev = rdev,
936+ };
937+
938+ err = -ENOMEM;
939+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
940+ if (!inode)
941+ goto out;
942+
943+ err = ovl_copy_up(dentry->d_parent);
944+ if (err)
945+ goto out_iput;
946+
947+ upperdir = ovl_dentry_upper(dentry->d_parent);
948+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
949+
950+ newdentry = ovl_upper_create(upperdir, dentry, &stat, link);
951+ err = PTR_ERR(newdentry);
952+ if (IS_ERR(newdentry))
953+ goto out_unlock;
954+
955+ ovl_dentry_version_inc(dentry->d_parent);
956+ if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) {
957+ err = ovl_set_opaque(newdentry);
958+ if (err) {
959+ vfs_rmdir(upperdir->d_inode, newdentry);
960+ ovl_whiteout(upperdir, dentry);
961+ goto out_dput;
962+ }
963+ }
964+ ovl_dentry_update(dentry, newdentry);
965+ d_instantiate(dentry, inode);
966+ inode = NULL;
967+ newdentry = NULL;
968+ err = 0;
969+
970+out_dput:
971+ dput(newdentry);
972+out_unlock:
973+ mutex_unlock(&upperdir->d_inode->i_mutex);
974+out_iput:
975+ iput(inode);
976+out:
977+ return err;
978+}
979+
980+static int ovl_create(struct inode *dir, struct dentry *dentry, int mode,
981+ struct nameidata *nd)
982+{
983+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
984+}
985+
986+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, int mode)
987+{
988+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
989+}
990+
991+static int ovl_mknod(struct inode *dir, struct dentry *dentry, int mode,
992+ dev_t rdev)
993+{
994+ return ovl_create_object(dentry, mode, rdev, NULL);
995+}
996+
997+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
998+ const char *link)
999+{
1000+ return ovl_create_object(dentry, S_IFLNK, 0, link);
1001+}
1002+
1003+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
1004+{
1005+ int err;
1006+ enum ovl_path_type type;
1007+ struct path realpath;
1008+ struct dentry *upperdir;
1009+
1010+ err = ovl_copy_up(dentry->d_parent);
1011+ if (err)
1012+ return err;
1013+
1014+ upperdir = ovl_dentry_upper(dentry->d_parent);
1015+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1016+ type = ovl_path_real(dentry, &realpath);
1017+ if (type != OVL_PATH_LOWER) {
1018+ err = -ESTALE;
1019+ if (realpath.dentry->d_parent != upperdir)
1020+ goto out_d_drop;
1021+
1022+ /* FIXME: create whiteout up front and rename to target */
1023+
1024+ if (is_dir)
1025+ err = vfs_rmdir(upperdir->d_inode, realpath.dentry);
1026+ else
1027+ err = vfs_unlink(upperdir->d_inode, realpath.dentry);
1028+ if (err)
1029+ goto out_d_drop;
1030+
1031+ ovl_dentry_version_inc(dentry->d_parent);
1032+ }
1033+
1034+ if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry))
1035+ err = ovl_whiteout(upperdir, dentry);
1036+
1037+ /*
1038+ * Keeping this dentry hashed would mean having to release
1039+ * upperpath/lowerpath, which could only be done if we are the
1040+ * sole user of this dentry. Too tricky... Just unhash for
1041+ * now.
1042+ */
1043+out_d_drop:
1044+ d_drop(dentry);
1045+ mutex_unlock(&upperdir->d_inode->i_mutex);
1046+
1047+ return err;
1048+}
1049+
1050+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
1051+{
1052+ return ovl_do_remove(dentry, false);
1053+}
1054+
1055+
1056+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
1057+{
1058+ int err;
1059+ enum ovl_path_type type;
1060+
1061+ type = ovl_path_type(dentry);
1062+ if (type != OVL_PATH_UPPER) {
1063+ err = ovl_check_empty_and_clear(dentry, type);
1064+ if (err)
1065+ return err;
1066+ }
1067+
1068+ return ovl_do_remove(dentry, true);
1069+}
1070+
1071+static int ovl_link(struct dentry *old, struct inode *newdir,
1072+ struct dentry *new)
1073+{
1074+ int err;
1075+ struct dentry *olddentry;
1076+ struct dentry *newdentry;
1077+ struct dentry *upperdir;
1078+
1079+ err = ovl_copy_up(old);
1080+ if (err)
1081+ goto out;
1082+
1083+ err = ovl_copy_up(new->d_parent);
1084+ if (err)
1085+ goto out;
1086+
1087+ upperdir = ovl_dentry_upper(new->d_parent);
1088+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1089+ newdentry = ovl_lookup_create(upperdir, new);
1090+ err = PTR_ERR(newdentry);
1091+ if (IS_ERR(newdentry))
1092+ goto out_unlock;
1093+
1094+ olddentry = ovl_dentry_upper(old);
1095+ err = vfs_link(olddentry, upperdir->d_inode, newdentry);
1096+ if (!err) {
1097+ if (WARN_ON(!newdentry->d_inode)) {
1098+ dput(newdentry);
1099+ err = -ENOENT;
1100+ goto out_unlock;
1101+ }
1102+
1103+ ovl_dentry_version_inc(new->d_parent);
1104+ ovl_dentry_update(new, newdentry);
1105+
1106+ ihold(old->d_inode);
1107+ d_instantiate(new, old->d_inode);
1108+ } else {
1109+ if (ovl_dentry_is_opaque(new))
1110+ ovl_whiteout(upperdir, new);
1111+ dput(newdentry);
1112+ }
1113+out_unlock:
1114+ mutex_unlock(&upperdir->d_inode->i_mutex);
1115+out:
1116+ return err;
1117+
1118+}
1119+/* Rename @old to @new: copy up @old and @new's parent, then rename the upper dentries. */
1120+static int ovl_rename(struct inode *olddir, struct dentry *old,
1121+ struct inode *newdir, struct dentry *new)
1122+{
1123+ int err;
1124+ enum ovl_path_type old_type;
1125+ enum ovl_path_type new_type;
1126+ struct dentry *old_upperdir;
1127+ struct dentry *new_upperdir;
1128+ struct dentry *olddentry;
1129+ struct dentry *newdentry;
1130+ struct dentry *trap;
1131+ bool old_opaque;
1132+ bool new_opaque;
1133+ bool new_create = false;
1134+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
1135+
1136+ /* Don't copy up directory trees */
1137+ old_type = ovl_path_type(old);
1138+ if (old_type != OVL_PATH_UPPER && is_dir)
1139+ return -EXDEV;
1140+
1141+ if (new->d_inode) {
1142+ new_type = ovl_path_type(new);
1143+
1144+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
1145+ if (ovl_dentry_lower(old)->d_inode ==
1146+ ovl_dentry_lower(new)->d_inode)
1147+ return 0; /* both names share one lower inode: nothing to do */
1148+ }
1149+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
1150+ if (ovl_dentry_upper(old)->d_inode ==
1151+ ovl_dentry_upper(new)->d_inode)
1152+ return 0; /* both names share one upper inode: nothing to do */
1153+ }
1154+
1155+ if (new_type != OVL_PATH_UPPER &&
1156+ S_ISDIR(new->d_inode->i_mode)) {
1157+ err = ovl_check_empty_and_clear(new, new_type);
1158+ if (err)
1159+ return err;
1160+ }
1161+ } else {
1162+ new_type = OVL_PATH_UPPER;
1163+ }
1164+
1165+ err = ovl_copy_up(old);
1166+ if (err)
1167+ return err;
1168+
1169+ err = ovl_copy_up(new->d_parent);
1170+ if (err)
1171+ return err;
1172+
1173+ old_upperdir = ovl_dentry_upper(old->d_parent);
1174+ new_upperdir = ovl_dentry_upper(new->d_parent);
1175+
1176+ trap = lock_rename(new_upperdir, old_upperdir);
1177+
1178+ olddentry = ovl_dentry_upper(old);
1179+ newdentry = ovl_dentry_upper(new);
1180+ if (newdentry) {
1181+ dget(newdentry); /* balances the dput() at out_dput */
1182+ } else {
1183+ new_create = true;
1184+ newdentry = ovl_lookup_create(new_upperdir, new);
1185+ err = PTR_ERR(newdentry);
1186+ if (IS_ERR(newdentry))
1187+ goto out_unlock;
1188+ }
1189+
1190+ err = -ESTALE; /* returned if any of the four checks below fails */
1191+ if (olddentry->d_parent != old_upperdir)
1192+ goto out_dput;
1193+ if (newdentry->d_parent != new_upperdir)
1194+ goto out_dput;
1195+ if (olddentry == trap)
1196+ goto out_dput;
1197+ if (newdentry == trap)
1198+ goto out_dput;
1199+
1200+ old_opaque = ovl_dentry_is_opaque(old);
1201+ new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER;
1202+
1203+ if (is_dir && !old_opaque && new_opaque) {
1204+ err = ovl_set_opaque(olddentry);
1205+ if (err)
1206+ goto out_dput;
1207+ }
1208+
1209+ err = vfs_rename(old_upperdir->d_inode, olddentry,
1210+ new_upperdir->d_inode, newdentry);
1211+
1212+ if (err) {
1213+ if (new_create && ovl_dentry_is_opaque(new))
1214+ ovl_whiteout(new_upperdir, new);
1215+ if (is_dir && !old_opaque && new_opaque)
1216+ ovl_remove_opaque(olddentry); /* undo the ovl_set_opaque() done before the rename */
1217+ goto out_dput;
1218+ }
1219+
1220+ if (old_type != OVL_PATH_UPPER || old_opaque)
1221+ err = ovl_whiteout(old_upperdir, old);
1222+ if (is_dir && old_opaque && !new_opaque)
1223+ ovl_remove_opaque(olddentry);
1224+
1225+ if (old_opaque != new_opaque)
1226+ ovl_dentry_set_opaque(old, new_opaque);
1227+
1228+ ovl_dentry_version_inc(old->d_parent);
1229+ ovl_dentry_version_inc(new->d_parent);
1230+
1231+out_dput:
1232+ dput(newdentry);
1233+out_unlock:
1234+ unlock_rename(new_upperdir, old_upperdir);
1235+ return err;
1236+}
1237+/* inode operations that ovl_new_inode() installs on S_IFDIR inodes */
1238+const struct inode_operations ovl_dir_inode_operations = {
1239+ .lookup = ovl_lookup,
1240+ .mkdir = ovl_mkdir,
1241+ .symlink = ovl_symlink,
1242+ .unlink = ovl_unlink,
1243+ .rmdir = ovl_rmdir,
1244+ .rename = ovl_rename,
1245+ .link = ovl_link,
1246+ .setattr = ovl_setattr,
1247+ .create = ovl_create,
1248+ .mknod = ovl_mknod,
1249+ .permission = ovl_permission,
1250+ .getattr = ovl_dir_getattr,
1251+ .setxattr = ovl_setxattr,
1252+ .getxattr = ovl_getxattr,
1253+ .listxattr = ovl_listxattr,
1254+ .removexattr = ovl_removexattr,
1255+};
1256--- /dev/null
1257+++ b/fs/overlayfs/inode.c
1258@@ -0,0 +1,375 @@
1259+/*
1260+ *
1261+ * Copyright (C) 2011 Novell Inc.
1262+ *
1263+ * This program is free software; you can redistribute it and/or modify it
1264+ * under the terms of the GNU General Public License version 2 as published by
1265+ * the Free Software Foundation.
1266+ */
1267+
1268+#include <linux/fs.h>
1269+#include <linux/slab.h>
1270+#include <linux/xattr.h>
1271+#include "overlayfs.h"
1272+/* Copy @dentry up and apply @attr to its upper dentry via notify_change(). */
1273+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
1274+{
1275+ struct dentry *upperdentry;
1276+ int err;
1277+
1278+ if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry))
1279+ err = ovl_copy_up_truncate(dentry, attr->ia_size); /* size change on a file with no upper dentry yet */
1280+ else
1281+ err = ovl_copy_up(dentry);
1282+ if (err)
1283+ return err;
1284+
1285+ upperdentry = ovl_dentry_upper(dentry);
1286+
1287+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
1288+ attr->ia_valid &= ~ATTR_MODE;
1289+
1290+ mutex_lock(&upperdentry->d_inode->i_mutex); /* notify_change() is called with the upper inode's i_mutex held */
1291+ err = notify_change(upperdentry, attr);
1292+ mutex_unlock(&upperdentry->d_inode->i_mutex);
1293+
1294+ return err;
1295+}
1296+/* Fill @stat from the real path of @dentry as chosen by ovl_path_real(); @mnt is unused. */
1297+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
1298+ struct kstat *stat)
1299+{
1300+ struct path realpath;
1301+
1302+ ovl_path_real(dentry, &realpath);
1303+ return vfs_getattr(realpath.mnt, realpath.dentry, stat);
1304+}
1305+/* Check @mask against the real inode behind overlay @inode; returns 0 or a negative errno. */
1306+int ovl_permission(struct inode *inode, int mask, unsigned int flags)
1307+{
1308+ struct ovl_entry *oe;
1309+ struct dentry *alias = NULL;
1310+ struct inode *realinode;
1311+ struct dentry *realdentry;
1312+ bool is_upper;
1313+ int err;
1314+
1315+ if (S_ISDIR(inode->i_mode)) {
1316+ oe = inode->i_private; /* stored for directories by ovl_new_inode() */
1317+ } else if (flags & IPERM_FLAG_RCU) {
1318+ return -ECHILD; /* non-directories are not handled when IPERM_FLAG_RCU is set */
1319+ } else {
1320+ /*
1321+ * For non-directories find an alias and get the info
1322+ * from there.
1323+ */
1324+ spin_lock(&inode->i_lock);
1325+ if (WARN_ON(list_empty(&inode->i_dentry))) {
1326+ spin_unlock(&inode->i_lock);
1327+ return -ENOENT;
1328+ }
1329+ alias = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1330+ dget(alias); /* dropped again at out_dput */
1331+ spin_unlock(&inode->i_lock);
1332+ oe = alias->d_fsdata;
1333+ }
1334+
1335+ realdentry = ovl_entry_real(oe, &is_upper);
1336+
1337+ /* Careful in RCU walk mode */
1338+ realinode = ACCESS_ONCE(realdentry->d_inode);
1339+ if (!realinode) {
1340+ WARN_ON(!(flags & IPERM_FLAG_RCU)); /* a missing real inode is only tolerated silently with IPERM_FLAG_RCU */
1341+ err = -ENOENT;
1342+ goto out_dput;
1343+ }
1344+
1345+ if (mask & MAY_WRITE) {
1346+ umode_t mode = realinode->i_mode;
1347+
1348+ /*
1349+ * Writes will always be redirected to upper layer, so
1350+ * ignore lower layer being read-only.
1351+ */
1352+ err = -EROFS;
1353+ if (is_upper && IS_RDONLY(realinode) &&
1354+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1355+ goto out_dput;
1356+
1357+ /*
1358+ * Nobody gets write access to an immutable file.
1359+ */
1360+ err = -EACCES;
1361+ if (IS_IMMUTABLE(realinode))
1362+ goto out_dput;
1363+ }
1364+
1365+ if (realinode->i_op->permission)
1366+ err = realinode->i_op->permission(realinode, mask, flags);
1367+ else
1368+ err = generic_permission(realinode, mask, flags,
1369+ realinode->i_op->check_acl);
1370+out_dput:
1371+ dput(alias); /* alias is still NULL on the directory path */
1372+ return err;
1373+}
1374+
1375+/* Cookie returned by ovl_follow_link() and consumed by ovl_put_link(). */
1376+struct ovl_link_data {
1377+ struct dentry *realdentry; /* real symlink dentry whose ->put_link must be called */
1378+ void *cookie; /* value returned by the real ->follow_link */
1379+};
1380+/* Follow the real symlink; if the real fs has ->put_link, wrap its cookie in an ovl_link_data. */
1381+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
1382+{
1383+ void *ret;
1384+ struct dentry *realdentry;
1385+ struct inode *realinode;
1386+
1387+ realdentry = ovl_dentry_real(dentry);
1388+ realinode = realdentry->d_inode;
1389+
1390+ if (WARN_ON(!realinode->i_op->follow_link))
1391+ return ERR_PTR(-EPERM);
1392+
1393+ ret = realinode->i_op->follow_link(realdentry, nd);
1394+ if (IS_ERR(ret))
1395+ return ret;
1396+
1397+ if (realinode->i_op->put_link) {
1398+ struct ovl_link_data *data;
1399+
1400+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
1401+ if (!data) {
1402+ realinode->i_op->put_link(realdentry, nd, ret); /* release the real cookie before failing */
1403+ return ERR_PTR(-ENOMEM);
1404+ }
1405+ data->realdentry = realdentry;
1406+ data->cookie = ret;
1407+
1408+ return data; /* freed by ovl_put_link() */
1409+ } else {
1410+ return NULL; /* ovl_put_link() does nothing for a NULL cookie */
1411+ }
1412+}
1413+/* Hand the wrapped cookie @c back to the real ->put_link and free the wrapper; no-op for NULL. */
1414+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1415+{
1416+ struct inode *realinode;
1417+ struct ovl_link_data *data = c;
1418+
1419+ if (!data)
1420+ return;
1421+
1422+ realinode = data->realdentry->d_inode;
1423+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
1424+ kfree(data);
1425+}
1426+/* readlink on the real dentry; -EINVAL if the real inode has no ->readlink. */
1427+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
1428+{
1429+ struct path realpath;
1430+ struct inode *realinode;
1431+
1432+ ovl_path_real(dentry, &realpath);
1433+ realinode = realpath.dentry->d_inode;
1434+
1435+ if (!realinode->i_op->readlink)
1436+ return -EINVAL;
1437+
1438+ touch_atime(realpath.mnt, realpath.dentry);
1439+
1440+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
1441+}
1442+
1443+/* True if @name starts with overlayfs's private xattr prefix "trusted.overlay." (all 16 bytes). */
1444+static bool ovl_is_private_xattr(const char *name)
1445+{
1446+ return strncmp(name, "trusted.overlay.", sizeof("trusted.overlay.") - 1) == 0;
1447+}
1448+/* Set xattr @name on the upper dentry after copy-up; private overlay names get -EPERM. */
1449+int ovl_setxattr(struct dentry *dentry, const char *name,
1450+ const void *value, size_t size, int flags)
1451+{
1452+ int err;
1453+ struct dentry *upperdentry;
1454+
1455+ if (ovl_is_private_xattr(name))
1456+ return -EPERM;
1457+
1458+ err = ovl_copy_up(dentry);
1459+ if (err)
1460+ return err;
1461+
1462+ upperdentry = ovl_dentry_upper(dentry);
1463+ return vfs_setxattr(upperdentry, name, value, size, flags);
1464+}
1465+/* Read xattr @name from the real dentry; private names report -ENODATA when the parent is OVL_PATH_MERGE. */
1466+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1467+ void *value, size_t size)
1468+{
1469+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1470+ ovl_is_private_xattr(name))
1471+ return -ENODATA;
1472+
1473+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
1474+}
1475+/* List xattrs of the real dentry, removing private names when the parent is OVL_PATH_MERGE. */
1476+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
1477+{
1478+ ssize_t res;
1479+ int off;
1480+
1481+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
1482+ if (res <= 0 || size == 0)
1483+ return res;
1484+
1485+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
1486+ return res;
1487+
1488+ /* filter out private xattrs */
1489+ for (off = 0; off < res;) {
1490+ char *s = list + off;
1491+ size_t slen = strlen(s) + 1; /* name length including its NUL terminator */
1492+
1493+ BUG_ON(off + slen > res);
1494+
1495+ if (ovl_is_private_xattr(s)) {
1496+ res -= slen;
1497+ memmove(s, s + slen, res - off); /* slide the remaining names over the removed one */
1498+ } else {
1499+ off += slen;
1500+ }
1501+ }
1502+
1503+ return res;
1504+}
1505+/* Remove xattr @name, copying @dentry up first when its type is OVL_PATH_LOWER. */
1506+int ovl_removexattr(struct dentry *dentry, const char *name)
1507+{
1508+ int err;
1509+ struct path realpath;
1510+ enum ovl_path_type type;
1511+
1512+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1513+ ovl_is_private_xattr(name))
1514+ return -ENODATA;
1515+
1516+ type = ovl_path_real(dentry, &realpath);
1517+ if (type == OVL_PATH_LOWER) {
1518+ err = vfs_getxattr(realpath.dentry, name, NULL, 0); /* fail before copy-up if the xattr cannot be read */
1519+ if (err < 0)
1520+ return err;
1521+
1522+ err = ovl_copy_up(dentry);
1523+ if (err)
1524+ return err;
1525+
1526+ ovl_path_upper(dentry, &realpath); /* operate on the upper dentry from here on */
1527+ }
1528+
1529+ return vfs_removexattr(realpath.dentry, name);
1530+}
1531+/* True only for an OVL_PATH_LOWER, non-special file opened for writing or with O_TRUNC. */
1532+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
1533+ struct dentry *realdentry)
1534+{
1535+ if (type != OVL_PATH_LOWER)
1536+ return false;
1537+
1538+ if (special_file(realdentry->d_inode->i_mode))
1539+ return false;
1540+
1541+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
1542+ return false;
1543+
1544+ return true;
1545+}
1546+/* Open the real file behind @dentry with vfs_open(), copying it up first when required. */
1547+static struct file *ovl_open(struct dentry *dentry, int flags,
1548+ const struct cred *cred)
1549+{
1550+ int err;
1551+ struct path realpath;
1552+ enum ovl_path_type type;
1553+
1554+ type = ovl_path_real(dentry, &realpath);
1555+ if (ovl_open_need_copy_up(flags, type, realpath.dentry)) {
1556+ if (flags & O_TRUNC)
1557+ err = ovl_copy_up_truncate(dentry, 0);
1558+ else
1559+ err = ovl_copy_up(dentry);
1560+ if (err)
1561+ return ERR_PTR(err);
1562+
1563+ ovl_path_upper(dentry, &realpath); /* open the upper copy instead of the lower file */
1564+ }
1565+
1566+ return vfs_open(&realpath, flags, cred);
1567+}
1568+/* inode operations that ovl_new_inode() installs on S_IFREG/SOCK/BLK/CHR/IFO inodes */
1569+static const struct inode_operations ovl_file_inode_operations = {
1570+ .setattr = ovl_setattr,
1571+ .permission = ovl_permission,
1572+ .getattr = ovl_getattr,
1573+ .setxattr = ovl_setxattr,
1574+ .getxattr = ovl_getxattr,
1575+ .listxattr = ovl_listxattr,
1576+ .removexattr = ovl_removexattr,
1577+ .open = ovl_open,
1578+};
1579+/* inode operations that ovl_new_inode() installs on S_IFLNK inodes */
1580+static const struct inode_operations ovl_symlink_inode_operations = {
1581+ .setattr = ovl_setattr,
1582+ .follow_link = ovl_follow_link,
1583+ .put_link = ovl_put_link,
1584+ .readlink = ovl_readlink,
1585+ .getattr = ovl_getattr,
1586+ .setxattr = ovl_setxattr,
1587+ .getxattr = ovl_getxattr,
1588+ .listxattr = ovl_listxattr,
1589+ .removexattr = ovl_removexattr,
1590+};
1591+/* Allocate an overlay inode for file type @mode; returns NULL on allocation failure or bad type. */
1592+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1593+ struct ovl_entry *oe)
1594+{
1595+ struct inode *inode;
1596+
1597+ inode = new_inode(sb);
1598+ if (!inode)
1599+ return NULL;
1600+
1601+ mode &= S_IFMT; /* only the file-type bits are kept in i_mode */
1602+
1603+ inode->i_ino = get_next_ino();
1604+ inode->i_mode = mode;
1605+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
1606+
1607+ switch (mode) {
1608+ case S_IFDIR:
1609+ inode->i_private = oe; /* read back by ovl_permission() for directories */
1610+ inode->i_op = &ovl_dir_inode_operations;
1611+ inode->i_fop = &ovl_dir_operations;
1612+ break;
1613+
1614+ case S_IFLNK:
1615+ inode->i_op = &ovl_symlink_inode_operations;
1616+ break;
1617+
1618+ case S_IFREG:
1619+ case S_IFSOCK:
1620+ case S_IFBLK:
1621+ case S_IFCHR:
1622+ case S_IFIFO:
1623+ inode->i_op = &ovl_file_inode_operations;
1624+ break;
1625+
1626+ default:
1627+ WARN(1, "illegal file type: %i\n", mode);
1628+ iput(inode); /* drop the inode from new_inode() instead of leaking it */
1629+ inode = NULL;
1630+ }
1631+
1632+ return inode;
1633+}
1634--- /dev/null
1635+++ b/fs/overlayfs/overlayfs.h
1636@@ -0,0 +1,62 @@
1637+/*
1638+ *
1639+ * Copyright (C) 2011 Novell Inc.
1640+ *
1641+ * This program is free software; you can redistribute it and/or modify it
1642+ * under the terms of the GNU General Public License version 2 as published by
1643+ * the Free Software Foundation.
1644+ */
1645+
1646+struct ovl_entry;
1647+/* Classification of an overlay dentry as computed by ovl_path_type(). */
1648+enum ovl_path_type {
1649+ OVL_PATH_UPPER, /* has an upper dentry and is not a merged directory */
1650+ OVL_PATH_MERGE, /* directory with both an upper and a lower dentry */
1651+ OVL_PATH_LOWER, /* has no upper dentry */
1652+};
1653+
1654+extern const char *ovl_opaque_xattr;
1655+extern const char *ovl_whiteout_xattr;
1656+extern const struct dentry_operations ovl_dentry_operations;
1657+
1658+enum ovl_path_type ovl_path_type(struct dentry *dentry);
1659+u64 ovl_dentry_version_get(struct dentry *dentry);
1660+void ovl_dentry_version_inc(struct dentry *dentry);
1661+void ovl_path_upper(struct dentry *dentry, struct path *path);
1662+void ovl_path_lower(struct dentry *dentry, struct path *path);
1663+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
1664+struct dentry *ovl_dentry_upper(struct dentry *dentry);
1665+struct dentry *ovl_dentry_lower(struct dentry *dentry);
1666+struct dentry *ovl_dentry_real(struct dentry *dentry);
1667+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
1668+bool ovl_dentry_is_opaque(struct dentry *dentry);
1669+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
1670+bool ovl_is_whiteout(struct dentry *dentry);
1671+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
1672+int ovl_do_lookup(struct dentry *dentry);
1673+
1674+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1675+ struct kstat *stat, const char *link);
1676+
1677+/* readdir.c */
1678+extern const struct file_operations ovl_dir_operations;
1679+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type);
1680+
1681+/* inode.c */
1682+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
1683+int ovl_permission(struct inode *inode, int mask, unsigned int flags);
1684+int ovl_setxattr(struct dentry *dentry, const char *name,
1685+ const void *value, size_t size, int flags);
1686+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1687+ void *value, size_t size);
1688+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
1689+int ovl_removexattr(struct dentry *dentry, const char *name);
1690+
1691+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1692+ struct ovl_entry *oe);
1693+/* dir.c */
1694+extern const struct inode_operations ovl_dir_inode_operations;
1695+
1696+/* copy_up.c */
1697+int ovl_copy_up(struct dentry *dentry);
1698+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size);
1699--- /dev/null
1700+++ b/fs/overlayfs/readdir.c
1701@@ -0,0 +1,558 @@
1702+/*
1703+ *
1704+ * Copyright (C) 2011 Novell Inc.
1705+ *
1706+ * This program is free software; you can redistribute it and/or modify it
1707+ * under the terms of the GNU General Public License version 2 as published by
1708+ * the Free Software Foundation.
1709+ */
1710+
1711+#include <linux/fs.h>
1712+#include <linux/slab.h>
1713+#include <linux/namei.h>
1714+#include <linux/file.h>
1715+#include <linux/xattr.h>
1716+#include <linux/rbtree.h>
1717+#include <linux/security.h>
1718+#include "overlayfs.h"
1719+/* One cached directory entry; always on a list, and in the rb-tree when added by ovl_fill_upper(). */
1720+struct ovl_cache_entry {
1721+ const char *name; /* NUL-terminated copy stored right after the struct */
1722+ unsigned int len;
1723+ unsigned int type; /* d_type as passed to the filldir callback */
1724+ u64 ino;
1725+ bool is_whiteout; /* set by ovl_dir_mark_whiteouts(); such entries are not emitted by ovl_readdir() */
1726+ struct list_head l_node;
1727+ struct rb_node node;
1728+};
1729+/* State handed to the filldir callbacks (as "buf") while reading a real directory. */
1730+struct ovl_readdir_data {
1731+ struct rb_root *root; /* name index built by ovl_fill_upper(), searched by ovl_fill_lower() */
1732+ struct list_head *list; /* cache list being filled */
1733+ struct list_head *middle; /* list position where ovl_fill_lower() places entries */
1734+ struct dentry *dir; /* upper directory used for whiteout lookups */
1735+ int count; /* entries seen during the current vfs_readdir() call */
1736+ int err; /* error recorded by a filldir callback */
1737+};
1738+/* Per-open state of an overlay directory, kept in file->private_data. */
1739+struct ovl_dir_file {
1740+ bool is_real; /* readdir/llseek are forwarded straight to realfile */
1741+ bool is_cached; /* cache below has been populated */
1742+ struct list_head cursor; /* position marker linked into cache */
1743+ u64 cache_version; /* directory version the cache was built at */
1744+ struct list_head cache; /* list of struct ovl_cache_entry */
1745+ struct file *realfile; /* open real directory, may be NULL after ovl_dir_reset() */
1746+};
1747+/* Map an rb-tree node back to the cache entry that embeds it. */
1748+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
1749+{
1750+ return container_of(n, struct ovl_cache_entry, node);
1751+}
1752+/* Find the entry named @name (@len bytes) in the rb-tree @root; returns NULL if absent. */
1753+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
1754+ const char *name, int len)
1755+{
1756+ struct rb_node *node = root->rb_node;
1757+ int cmp;
1758+
1759+ while (node) {
1760+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
1761+
1762+ cmp = strncmp(name, p->name, len);
1763+ if (cmp > 0)
1764+ node = p->node.rb_right;
1765+ else if (cmp < 0 || len < p->len) /* @name is a proper prefix of p->name: sorts before it */
1766+ node = p->node.rb_left;
1767+ else
1768+ return p;
1769+ }
1770+
1771+ return NULL;
1772+}
1773+/* Allocate an entry holding a NUL-terminated copy of @name; returns NULL if kmalloc fails. */
1774+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
1775+ u64 ino, unsigned int d_type)
1776+{
1777+ struct ovl_cache_entry *p;
1778+
1779+ p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); /* struct plus name plus terminator in one allocation */
1780+ if (p) {
1781+ char *name_copy = (char *) (p + 1);
1782+ memcpy(name_copy, name, len);
1783+ name_copy[len] = '\0';
1784+ p->name = name_copy;
1785+ p->len = len;
1786+ p->type = d_type;
1787+ p->ino = ino;
1788+ p->is_whiteout = false;
1789+ }
1790+
1791+ return p;
1792+}
1793+/* Add a new entry to rdd->root and the tail of rdd->list; a name already in the tree is left as is. */
1794+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
1795+ const char *name, int len, u64 ino,
1796+ unsigned int d_type)
1797+{
1798+ struct rb_node **newp = &rdd->root->rb_node;
1799+ struct rb_node *parent = NULL;
1800+ struct ovl_cache_entry *p;
1801+
1802+ while (*newp) {
1803+ int cmp;
1804+ struct ovl_cache_entry *tmp;
1805+
1806+ parent = *newp;
1807+ tmp = ovl_cache_entry_from_node(*newp);
1808+ cmp = strncmp(name, tmp->name, len);
1809+ if (cmp > 0)
1810+ newp = &tmp->node.rb_right;
1811+ else if (cmp < 0 || len < tmp->len)
1812+ newp = &tmp->node.rb_left;
1813+ else
1814+ return 0; /* same name already present */
1815+ }
1816+
1817+ p = ovl_cache_entry_new(name, len, ino, d_type);
1818+ if (p == NULL)
1819+ return -ENOMEM;
1820+
1821+ list_add_tail(&p->l_node, rdd->list);
1822+ rb_link_node(&p->node, parent, newp);
1823+ rb_insert_color(&p->node, rdd->root);
1824+
1825+ return 0;
1826+}
1827+/* filldir callback: queue the entry before rdd->middle, reusing an entry already in the rb-tree. */
1828+static int ovl_fill_lower(void *buf, const char *name, int namelen,
1829+ loff_t offset, u64 ino, unsigned int d_type)
1830+{
1831+ struct ovl_readdir_data *rdd = buf;
1832+ struct ovl_cache_entry *p;
1833+
1834+ rdd->count++;
1835+ p = ovl_cache_entry_find(rdd->root, name, namelen);
1836+ if (p) {
1837+ list_move_tail(&p->l_node, rdd->middle); /* existing entry is kept and only repositioned */
1838+ } else {
1839+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
1840+ if (p == NULL)
1841+ rdd->err = -ENOMEM;
1842+ else
1843+ list_add_tail(&p->l_node, rdd->middle);
1844+ }
1845+
1846+ return rdd->err;
1847+}
1848+/* Free every cache entry on @list and leave @list initialized as empty. */
1849+static void ovl_cache_free(struct list_head *list)
1850+{
1851+ struct ovl_cache_entry *p;
1852+ struct ovl_cache_entry *n;
1853+
1854+ list_for_each_entry_safe(p, n, list, l_node)
1855+ kfree(p);
1856+
1857+ INIT_LIST_HEAD(list);
1858+}
1859+/* filldir callback: count the entry and add it via ovl_cache_entry_add_rb(). */
1860+static int ovl_fill_upper(void *buf, const char *name, int namelen,
1861+ loff_t offset, u64 ino, unsigned int d_type)
1862+{
1863+ struct ovl_readdir_data *rdd = buf;
1864+
1865+ rdd->count++;
1866+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
1867+}
1868+
1869+static int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd,
1870+ filldir_t filler)
1871+{
1872+ struct file *realfile;
1873+ int err;
1874+
1875+ realfile = vfs_open(realpath, O_RDONLY | O_DIRECTORY, current_cred());
1876+ if (IS_ERR(realfile))
1877+ return PTR_ERR(realfile);
1878+
1879+ do {
1880+ rdd->count = 0;
1881+ rdd->err = 0;
1882+ err = vfs_readdir(realfile, filler, rdd);
1883+ if (err >= 0)
1884+ err = rdd->err;
1885+ } while (!err && rdd->count);
1886+ fput(realfile);
1887+
1888+ return 0;
1889+}
1890+/* Drop a cache built at another directory version; close realfile if the dir became OVL_PATH_MERGE. */
1891+static void ovl_dir_reset(struct file *file)
1892+{
1893+ struct ovl_dir_file *od = file->private_data;
1894+ enum ovl_path_type type = ovl_path_type(file->f_path.dentry);
1895+
1896+ if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) {
1897+ list_del_init(&od->cursor); /* unlink the cursor before the entries around it are freed */
1898+ ovl_cache_free(&od->cache);
1899+ od->is_cached = false;
1900+ }
1901+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
1902+ if (od->is_real && type == OVL_PATH_MERGE) {
1903+ fput(od->realfile);
1904+ od->realfile = NULL;
1905+ od->is_real = false;
1906+ }
1907+}
1908+/* Set is_whiteout on DT_LNK entries of rdd->list that ovl_is_whiteout() identifies in rdd->dir. */
1909+static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd)
1910+{
1911+ struct ovl_cache_entry *p;
1912+ struct dentry *dentry;
1913+ const struct cred *old_cred;
1914+ struct cred *override_cred;
1915+
1916+ override_cred = prepare_creds();
1917+ if (!override_cred) {
1918+ ovl_cache_free(rdd->list);
1919+ return -ENOMEM;
1920+ }
1921+
1922+ /*
1923+ * CAP_SYS_ADMIN for getxattr
1924+ * CAP_DAC_OVERRIDE for lookup
1925+ */
1926+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1927+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
1928+ old_cred = override_creds(override_cred);
1929+
1930+ mutex_lock(&rdd->dir->d_inode->i_mutex);
1931+ list_for_each_entry(p, rdd->list, l_node) {
1932+ if (p->type != DT_LNK) /* only symlink-typed entries are examined */
1933+ continue;
1934+
1935+ dentry = lookup_one_len(p->name, rdd->dir, p->len);
1936+ if (IS_ERR(dentry)) /* lookup failure: entry stays a non-whiteout */
1937+ continue;
1938+
1939+ p->is_whiteout = ovl_is_whiteout(dentry);
1940+ dput(dentry);
1941+ }
1942+ mutex_unlock(&rdd->dir->d_inode->i_mutex);
1943+
1944+ revert_creds(old_cred);
1945+ put_cred(override_cred);
1946+
1947+ return 0;
1948+}
1949+/* Fill rdd->list from the upper dir (if present, with whiteouts marked) and then the lower dir. */
1950+static int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath,
1951+ struct ovl_readdir_data *rdd)
1952+{
1953+ int err;
1954+ struct rb_root root = RB_ROOT;
1955+ struct list_head middle;
1956+
1957+ rdd->root = &root;
1958+ if (upperpath->dentry) {
1959+ rdd->dir = upperpath->dentry;
1960+ err = ovl_dir_read(upperpath, rdd, ovl_fill_upper);
1961+ if (err)
1962+ goto out;
1963+
1964+ err = ovl_dir_mark_whiteouts(rdd);
1965+ if (err)
1966+ goto out;
1967+ }
1968+ /*
1969+ * Insert lowerpath entries before upperpath ones, this allows
1970+ * offsets to be reasonably constant
1971+ */
1972+ list_add(&middle, rdd->list); /* temporary marker at the head of the list */
1973+ rdd->middle = &middle;
1974+ err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower);
1975+ list_del(&middle);
1976+out:
1977+ rdd->root = NULL; /* root lives on this stack frame and must not outlive it */
1978+
1979+ return err;
1980+}
1981+/* Move the cursor in front of the list element reached after @pos steps from the cache head. */
1982+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
1983+{
1984+ struct list_head *l;
1985+ loff_t off;
1986+
1987+ l = od->cache.next;
1988+ for (off = 0; off < pos; off++) {
1989+ if (l == &od->cache) /* ran off the end: cursor goes to the tail */
1990+ break;
1991+ l = l->next;
1992+ }
1993+ list_move_tail(&od->cursor, l);
1994+}
1995+/* readdir: forward to the real directory, or emit entries from the merged cache (built on demand). */
1996+static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
1997+{
1998+ struct ovl_dir_file *od = file->private_data;
1999+ int res;
2000+
2001+ if (!file->f_pos)
2002+ ovl_dir_reset(file);
2003+
2004+ if (od->is_real) {
2005+ res = vfs_readdir(od->realfile, filler, buf);
2006+ file->f_pos = od->realfile->f_pos;
2007+
2008+ return res;
2009+ }
2010+
2011+ if (!od->is_cached) {
2012+ struct path lowerpath;
2013+ struct path upperpath;
2014+ struct ovl_readdir_data rdd = { .list = &od->cache };
2015+
2016+ ovl_path_lower(file->f_path.dentry, &lowerpath);
2017+ ovl_path_upper(file->f_path.dentry, &upperpath);
2018+
2019+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2020+ if (res) {
2021+ ovl_cache_free(rdd.list);
2022+ return res;
2023+ }
2024+
2025+ od->cache_version = ovl_dentry_version_get(file->f_path.dentry);
2026+ od->is_cached = true;
2027+
2028+ ovl_seek_cursor(od, file->f_pos);
2029+ }
2030+
2031+ while (od->cursor.next != &od->cache) {
2032+ int over;
2033+ loff_t off;
2034+ struct ovl_cache_entry *p;
2035+
2036+ p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node);
2037+ off = file->f_pos;
2038+ if (!p->is_whiteout) {
2039+ over = filler(buf, p->name, p->len, off, p->ino, p->type);
2040+ if (over) /* caller's buffer is full: leave the cursor before this entry */
2041+ break;
2042+ }
2043+ file->f_pos++; /* whiteouts still consume an offset */
2044+ list_move(&od->cursor, &p->l_node);
2045+ }
2046+
2047+ return 0;
2048+}
2049+/* llseek: forward to the real directory, or accept SEEK_SET/SEEK_CUR to a non-negative cache offset. */
2050+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
2051+{
2052+ loff_t res;
2053+ struct ovl_dir_file *od = file->private_data;
2054+
2055+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
2056+ if (!file->f_pos)
2057+ ovl_dir_reset(file);
2058+
2059+ if (od->is_real) {
2060+ res = vfs_llseek(od->realfile, offset, origin);
2061+ file->f_pos = od->realfile->f_pos;
2062+ } else {
2063+ res = -EINVAL; /* result for unsupported origins and negative offsets */
2064+
2065+ switch (origin) {
2066+ case SEEK_CUR:
2067+ offset += file->f_pos;
2068+ break;
2069+ case SEEK_SET:
2070+ break;
2071+ default:
2072+ goto out_unlock;
2073+ }
2074+ if (offset < 0)
2075+ goto out_unlock;
2076+
2077+ if (offset != file->f_pos) {
2078+ file->f_pos = offset;
2079+ if (od->is_cached)
2080+ ovl_seek_cursor(od, offset);
2081+ }
2082+ res = offset;
2083+ }
2084+out_unlock:
2085+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
2086+
2087+ return res;
2088+}
2089+/* fsync the real directory; od->realfile is only ever set to a successfully opened file. */
2090+static int ovl_dir_fsync(struct file *file, int datasync)
2091+{
2092+ struct ovl_dir_file *od = file->private_data;
2093+ struct file *realfile = od->realfile;
2094+ struct path upperpath;
2095+
2096+ /* May need to reopen directory if it got copied up */
2097+ if (!realfile) {
2098+ ovl_path_upper(file->f_path.dentry, &upperpath);
2099+ realfile = vfs_open(&upperpath, O_RDONLY, current_cred());
2100+ if (IS_ERR(realfile))
2101+ return PTR_ERR(realfile); /* od->realfile stays NULL, so release will not fput() an ERR_PTR */
2102+ od->realfile = realfile;
2103+ }
2104+ return vfs_fsync(realfile, datasync);
2105+}
2106+/* Release per-open state: unlink the cursor, free the cache, drop realfile and free @od. */
2107+static int ovl_dir_release(struct inode *inode, struct file *file)
2108+{
2109+ struct ovl_dir_file *od = file->private_data;
2110+
2111+ list_del(&od->cursor);
2112+ ovl_cache_free(&od->cache);
2113+ if (od->realfile) /* may be NULL after ovl_dir_reset() */
2114+ fput(od->realfile);
2115+ kfree(od);
2116+
2117+ return 0;
2118+}
2119+/* Open the real directory and attach a fresh ovl_dir_file to @file->private_data. */
2120+static int ovl_dir_open(struct inode *inode, struct file *file)
2121+{
2122+ struct path realpath;
2123+ struct file *realfile;
2124+ struct ovl_dir_file *od;
2125+ enum ovl_path_type type;
2126+
2127+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
2128+ if (!od)
2129+ return -ENOMEM;
2130+
2131+ type = ovl_path_real(file->f_path.dentry, &realpath);
2132+ realfile = vfs_open(&realpath, file->f_flags, current_cred());
2133+ if (IS_ERR(realfile)) {
2134+ kfree(od);
2135+ return PTR_ERR(realfile);
2136+ }
2137+ INIT_LIST_HEAD(&od->cache);
2138+ INIT_LIST_HEAD(&od->cursor);
2139+ od->is_cached = false;
2140+ od->realfile = realfile;
2141+ od->is_real = (type != OVL_PATH_MERGE); /* only merged directories use the cache path */
2142+ file->private_data = od;
2143+
2144+ return 0;
2145+}
2146+/* file operations that ovl_new_inode() installs on S_IFDIR inodes */
2147+const struct file_operations ovl_dir_operations = {
2148+ .read = generic_read_dir,
2149+ .open = ovl_dir_open,
2150+ .readdir = ovl_readdir,
2151+ .llseek = ovl_dir_llseek,
2152+ .fsync = ovl_dir_fsync,
2153+ .release = ovl_dir_release,
2154+};
2155+/* Read the merged directory into @list; -ENOTEMPTY if it has an entry besides ".", ".." and whiteouts. */
2156+static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
2157+{
2158+ int err;
2159+ struct path lowerpath;
2160+ struct path upperpath;
2161+ struct ovl_cache_entry *p;
2162+ struct ovl_readdir_data rdd = { .list = list };
2163+
2164+ ovl_path_upper(dentry, &upperpath);
2165+ ovl_path_lower(dentry, &lowerpath);
2166+
2167+ err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2168+ if (err)
2169+ return err;
2170+
2171+ err = 0;
2172+
2173+ list_for_each_entry(p, list, l_node) {
2174+ if (p->is_whiteout)
2175+ continue;
2176+
2177+ if (p->name[0] == '.') {
2178+ if (p->len == 1) /* "." */
2179+ continue;
2180+ if (p->len == 2 && p->name[1] == '.') /* ".." */
2181+ continue;
2182+ }
2183+ err = -ENOTEMPTY;
2184+ break;
2185+ }
2186+
2187+ return err;
2188+}
2189+/* Set the opaque xattr on @dir's upper dentry, then unlink every whiteout entry of @list from it. */
2190+static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list)
2191+{
2192+ struct path upperpath;
2193+ struct dentry *upperdir;
2194+ struct ovl_cache_entry *p;
2195+ const struct cred *old_cred;
2196+ struct cred *override_cred;
2197+ int err;
2198+
2199+ ovl_path_upper(dir, &upperpath);
2200+ upperdir = upperpath.dentry;
2201+
2202+ override_cred = prepare_creds();
2203+ if (!override_cred)
2204+ return -ENOMEM;
2205+
2206+ /*
2207+ * CAP_DAC_OVERRIDE for lookup and unlink
2208+ * CAP_SYS_ADMIN for setxattr of "trusted" namespace
2209+ * CAP_FOWNER for unlink in sticky directory
2210+ */
2211+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2212+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2213+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
2214+ old_cred = override_creds(override_cred);
2215+
2216+ err = vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0);
2217+ if (err)
2218+ goto out_revert_creds;
2219+
2220+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2221+ list_for_each_entry(p, list, l_node) {
2222+ struct dentry *dentry;
2223+ int ret;
2224+
2225+ if (!p->is_whiteout)
2226+ continue;
2227+
2228+ dentry = lookup_one_len(p->name, upperdir, p->len);
2229+ if (IS_ERR(dentry)) {
2230+ printk(KERN_WARNING "overlayfs: failed to lookup whiteout %.*s: %li\n", p->len, p->name, PTR_ERR(dentry));
2231+ continue;
2232+ }
2233+ ret = vfs_unlink(upperdir->d_inode, dentry);
2234+ dput(dentry);
2235+ if (ret) /* only logged: per-entry failures do not change the return value */
2236+ printk(KERN_WARNING "overlayfs: failed to unlink whiteout %.*s: %i\n", p->len, p->name, ret);
2237+ }
2238+ mutex_unlock(&upperdir->d_inode->i_mutex);
2239+
2240+out_revert_creds:
2241+ revert_creds(old_cred);
2242+ put_cred(override_cred);
2243+
2244+ return err;
2245+}
2246+/* Verify @dentry is an empty directory and, for OVL_PATH_MERGE, remove its whiteouts. */
2247+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type)
2248+{
2249+ int err;
2250+ LIST_HEAD(list);
2251+
2252+ err = ovl_check_empty_dir(dentry, &list);
2253+ if (!err && type == OVL_PATH_MERGE)
2254+ err = ovl_remove_whiteouts(dentry, &list);
2255+
2256+ ovl_cache_free(&list); /* entries read by ovl_check_empty_dir() are freed on every path */
2257+
2258+ return err;
2259+}
2260--- /dev/null
2261+++ b/fs/overlayfs/super.c
2262@@ -0,0 +1,625 @@
2263+/*
2264+ *
2265+ * Copyright (C) 2011 Novell Inc.
2266+ *
2267+ * This program is free software; you can redistribute it and/or modify it
2268+ * under the terms of the GNU General Public License version 2 as published by
2269+ * the Free Software Foundation.
2270+ */
2271+
2272+#include <linux/fs.h>
2273+#include <linux/namei.h>
2274+#include <linux/xattr.h>
2275+#include <linux/security.h>
2276+#include <linux/mount.h>
2277+#include <linux/slab.h>
2278+#include <linux/parser.h>
2279+#include <linux/module.h>
2280+#include <linux/seq_file.h>
2281+#include "overlayfs.h"
2282+
2283+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
2284+MODULE_DESCRIPTION("Overlay filesystem");
2285+MODULE_LICENSE("GPL");
2286+/* Directory names of the two layers; struct ovl_fs keeps a copy for show_options. */
2287+struct ovl_config {
2288+ char *lowerdir;
2289+ char *upperdir;
2290+};
2291+
2292+/* private information held for overlayfs's superblock */
2293+struct ovl_fs {
2294+ struct vfsmount *upper_mnt; /* mount placed in paths built by ovl_path_upper() */
2295+ struct vfsmount *lower_mnt; /* mount placed in paths built by ovl_path_lower() */
2296+ /* pathnames of lower and upper dirs, for show_options */
2297+ struct ovl_config config;
2298+};
2299+
2300+/* private information held for every overlayfs dentry */
2301+struct ovl_entry {
2302+ /*
2303+ * Keep "double reference" on upper dentries, so that
2304+ * d_delete() doesn't think it's OK to reset d_inode to NULL.
2305+ */
2306+ struct dentry *__upperdentry; /* NULL while the entry has no upper dentry (see ovl_path_type()) */
2307+ struct dentry *lowerdentry;
2308+ union {
2309+ struct {
2310+ u64 version;
2311+ bool opaque;
2312+ };
2313+ struct rcu_head rcu; /* shares storage with version/opaque */
2314+ };
2315+};
2316+/* Names of overlayfs's private xattrs; both start with "trusted.overlay." */
2317+const char *ovl_whiteout_xattr = "trusted.overlay.whiteout";
2318+const char *ovl_opaque_xattr = "trusted.overlay.opaque";
2319+
2320+/* Classify @dentry: MERGE for a directory with both layers, UPPER if it has an upper dentry, else LOWER. */
2321+enum ovl_path_type ovl_path_type(struct dentry *dentry)
2322+{
2323+ struct ovl_entry *oe = dentry->d_fsdata;
2324+
2325+ if (oe->__upperdentry) {
2326+ if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode))
2327+ return OVL_PATH_MERGE;
2328+ else
2329+ return OVL_PATH_UPPER;
2330+ } else {
2331+ return OVL_PATH_LOWER;
2332+ }
2333+}
2334+
2335+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
2336+{
2337+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
2338+ smp_read_barrier_depends();
2339+ return upperdentry;
2340+}
2341+
2342+void ovl_path_upper(struct dentry *dentry, struct path *path)
2343+{
2344+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2345+ struct ovl_entry *oe = dentry->d_fsdata;
2346+
2347+ path->mnt = ofs->upper_mnt;
2348+ path->dentry = ovl_upperdentry_dereference(oe);
2349+}
2350+
2351+void ovl_path_lower(struct dentry *dentry, struct path *path)
2352+{
2353+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2354+ struct ovl_entry *oe = dentry->d_fsdata;
2355+
2356+ path->mnt = ofs->lower_mnt;
2357+ path->dentry = oe->lowerdentry;
2358+}
2359+
2360+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
2361+{
2362+
2363+ enum ovl_path_type type = ovl_path_type(dentry);
2364+
2365+ if (type == OVL_PATH_LOWER)
2366+ ovl_path_lower(dentry, path);
2367+ else
2368+ ovl_path_upper(dentry, path);
2369+
2370+ return type;
2371+}
2372+
2373+struct dentry *ovl_dentry_upper(struct dentry *dentry)
2374+{
2375+ struct ovl_entry *oe = dentry->d_fsdata;
2376+
2377+ return ovl_upperdentry_dereference(oe);
2378+}
2379+
2380+struct dentry *ovl_dentry_lower(struct dentry *dentry)
2381+{
2382+ struct ovl_entry *oe = dentry->d_fsdata;
2383+
2384+ return oe->lowerdentry;
2385+}
2386+
2387+struct dentry *ovl_dentry_real(struct dentry *dentry)
2388+{
2389+ struct ovl_entry *oe = dentry->d_fsdata;
2390+ struct dentry *realdentry;
2391+
2392+ realdentry = ovl_upperdentry_dereference(oe);
2393+ if (!realdentry)
2394+ realdentry = oe->lowerdentry;
2395+
2396+ return realdentry;
2397+}
2398+
2399+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
2400+{
2401+ struct dentry *realdentry;
2402+
2403+ realdentry = ovl_upperdentry_dereference(oe);
2404+ if (realdentry) {
2405+ *is_upper = true;
2406+ } else {
2407+ realdentry = oe->lowerdentry;
2408+ *is_upper = false;
2409+ }
2410+ return realdentry;
2411+}
2412+
2413+bool ovl_dentry_is_opaque(struct dentry *dentry)
2414+{
2415+ struct ovl_entry *oe = dentry->d_fsdata;
2416+ return oe->opaque;
2417+}
2418+
2419+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
2420+{
2421+ struct ovl_entry *oe = dentry->d_fsdata;
2422+ oe->opaque = opaque;
2423+}
2424+
2425+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
2426+{
2427+ struct ovl_entry *oe = dentry->d_fsdata;
2428+
2429+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
2430+ WARN_ON(oe->__upperdentry);
2431+ BUG_ON(!upperdentry->d_inode);
2432+ smp_wmb();
2433+ oe->__upperdentry = dget(upperdentry);
2434+}
2435+
2436+void ovl_dentry_version_inc(struct dentry *dentry)
2437+{
2438+ struct ovl_entry *oe = dentry->d_fsdata;
2439+
2440+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2441+ oe->version++;
2442+}
2443+
2444+u64 ovl_dentry_version_get(struct dentry *dentry)
2445+{
2446+ struct ovl_entry *oe = dentry->d_fsdata;
2447+
2448+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2449+ return oe->version;
2450+}
2451+
2452+bool ovl_is_whiteout(struct dentry *dentry)
2453+{
2454+ int res;
2455+ char val;
2456+
2457+ if (!dentry)
2458+ return false;
2459+ if (!dentry->d_inode)
2460+ return false;
2461+ if (!S_ISLNK(dentry->d_inode->i_mode))
2462+ return false;
2463+
2464+ res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1);
2465+ if (res == 1 && val == 'y')
2466+ return true;
2467+
2468+ return false;
2469+}
2470+
2471+static bool ovl_is_opaquedir(struct dentry *dentry)
2472+{
2473+ int res;
2474+ char val;
2475+
2476+ if (!S_ISDIR(dentry->d_inode->i_mode))
2477+ return false;
2478+
2479+ res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1);
2480+ if (res == 1 && val == 'y')
2481+ return true;
2482+
2483+ return false;
2484+}
2485+
2486+static void ovl_entry_free(struct rcu_head *head)
2487+{
2488+ struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu);
2489+ kfree(oe);
2490+}
2491+
2492+static void ovl_dentry_release(struct dentry *dentry)
2493+{
2494+ struct ovl_entry *oe = dentry->d_fsdata;
2495+
2496+ if (oe) {
2497+ dput(oe->__upperdentry);
2498+ dput(oe->__upperdentry);
2499+ dput(oe->lowerdentry);
2500+ call_rcu(&oe->rcu, ovl_entry_free);
2501+ }
2502+}
2503+
2504+const struct dentry_operations ovl_dentry_operations = {
2505+ .d_release = ovl_dentry_release,
2506+};
2507+
2508+static struct ovl_entry *ovl_alloc_entry(void)
2509+{
2510+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
2511+}
2512+
2513+static struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name)
2514+{
2515+ struct dentry *dentry;
2516+
2517+ mutex_lock(&dir->d_inode->i_mutex);
2518+ dentry = lookup_one_len(name->name, dir, name->len);
2519+ mutex_unlock(&dir->d_inode->i_mutex);
2520+
2521+ if (IS_ERR(dentry)) {
2522+ if (PTR_ERR(dentry) == -ENOENT)
2523+ dentry = NULL;
2524+ } else if (!dentry->d_inode) {
2525+ dput(dentry);
2526+ dentry = NULL;
2527+ }
2528+ return dentry;
2529+}
2530+
2531+int ovl_do_lookup(struct dentry *dentry)
2532+{
2533+ struct ovl_entry *oe;
2534+ struct dentry *upperdir;
2535+ struct dentry *lowerdir;
2536+ struct dentry *upperdentry = NULL;
2537+ struct dentry *lowerdentry = NULL;
2538+ struct inode *inode = NULL;
2539+ int err;
2540+
2541+ err = -ENOMEM;
2542+ oe = ovl_alloc_entry();
2543+ if (!oe)
2544+ goto out;
2545+
2546+ upperdir = ovl_dentry_upper(dentry->d_parent);
2547+ lowerdir = ovl_dentry_lower(dentry->d_parent);
2548+
2549+ if (upperdir) {
2550+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
2551+ err = PTR_ERR(upperdentry);
2552+ if (IS_ERR(upperdentry))
2553+ goto out_put_dir;
2554+
2555+ if (lowerdir && upperdentry &&
2556+ (S_ISLNK(upperdentry->d_inode->i_mode) ||
2557+ S_ISDIR(upperdentry->d_inode->i_mode))) {
2558+ const struct cred *old_cred;
2559+ struct cred *override_cred;
2560+
2561+ err = -ENOMEM;
2562+ override_cred = prepare_creds();
2563+ if (!override_cred)
2564+ goto out_dput_upper;
2565+
2566+ /* CAP_SYS_ADMIN needed for getxattr */
2567+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2568+ old_cred = override_creds(override_cred);
2569+
2570+ if (ovl_is_opaquedir(upperdentry)) {
2571+ oe->opaque = true;
2572+ } else if (ovl_is_whiteout(upperdentry)) {
2573+ dput(upperdentry);
2574+ upperdentry = NULL;
2575+ oe->opaque = true;
2576+ }
2577+ revert_creds(old_cred);
2578+ put_cred(override_cred);
2579+ }
2580+ }
2581+ if (lowerdir && !oe->opaque) {
2582+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
2583+ err = PTR_ERR(lowerdentry);
2584+ if (IS_ERR(lowerdentry))
2585+ goto out_dput_upper;
2586+ }
2587+
2588+ if (lowerdentry && upperdentry &&
2589+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
2590+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
2591+ dput(lowerdentry);
2592+ lowerdentry = NULL;
2593+ oe->opaque = true;
2594+ }
2595+
2596+ if (lowerdentry || upperdentry) {
2597+ struct dentry *realdentry;
2598+
2599+ realdentry = upperdentry ? upperdentry : lowerdentry;
2600+ err = -ENOMEM;
2601+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe);
2602+ if (!inode)
2603+ goto out_dput;
2604+ }
2605+
2606+ if (upperdentry)
2607+ oe->__upperdentry = dget(upperdentry);
2608+
2609+ if (lowerdentry)
2610+ oe->lowerdentry = lowerdentry;
2611+
2612+ dentry->d_fsdata = oe;
2613+ dentry->d_op = &ovl_dentry_operations;
2614+ d_add(dentry, inode);
2615+
2616+ return 0;
2617+
2618+out_dput:
2619+ dput(lowerdentry);
2620+out_dput_upper:
2621+ dput(upperdentry);
2622+out_put_dir:
2623+ kfree(oe);
2624+out:
2625+ return err;
2626+}
2627+
2628+static void ovl_put_super(struct super_block *sb)
2629+{
2630+ struct ovl_fs *ufs = sb->s_fs_info;
2631+
2632+ if (!(sb->s_flags & MS_RDONLY))
2633+ mnt_drop_write(ufs->upper_mnt);
2634+
2635+ mntput(ufs->upper_mnt);
2636+ mntput(ufs->lower_mnt);
2637+
2638+ kfree(ufs->config.lowerdir);
2639+ kfree(ufs->config.upperdir);
2640+ kfree(ufs);
2641+}
2642+
2643+static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data)
2644+{
2645+ int flags = *flagsp;
2646+ struct ovl_fs *ufs = sb->s_fs_info;
2647+
2648+ /* When remounting rw or ro, we need to adjust the write access to the
2649+ * upper fs.
2650+ */
2651+ if (((flags ^ sb->s_flags) & MS_RDONLY) == 0)
2652+ /* No change to readonly status */
2653+ return 0;
2654+
2655+ if (flags & MS_RDONLY) {
2656+ mnt_drop_write(ufs->upper_mnt);
2657+ return 0;
2658+ } else
2659+ return mnt_want_write(ufs->upper_mnt);
2660+}
2661+
2662+/**
2663+ * ovl_statfs
2664+ * @sb: The overlayfs super block
2665+ * @buf: The struct kstatfs to fill in with stats
2666+ *
2667+ * Get the filesystem statistics. As writes always target the upper layer
2668+ * filesystem pass the statfs to the same filesystem.
2669+ */
2670+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
2671+{
2672+ struct dentry *root_dentry = dentry->d_sb->s_root;
2673+ struct path path;
2674+ ovl_path_upper(root_dentry, &path);
2675+
2676+ if (!path.dentry->d_sb->s_op->statfs)
2677+ return -ENOSYS;
2678+ return path.dentry->d_sb->s_op->statfs(path.dentry, buf);
2679+}
2680+
2681+/**
2682+ * ovl_show_options
2683+ *
2684+ * Prints the mount options for a given superblock.
2685+ * Returns zero; does not fail.
2686+ */
2687+static int ovl_show_options(struct seq_file *m, struct vfsmount *mnt)
2688+{
2689+ struct super_block *sb = mnt->mnt_sb;
2690+ struct ovl_fs *ufs = sb->s_fs_info;
2691+
2692+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
2693+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
2694+ return 0;
2695+}
2696+
2697+static const struct super_operations ovl_super_operations = {
2698+ .put_super = ovl_put_super,
2699+ .remount_fs = ovl_remount_fs,
2700+ .statfs = ovl_statfs,
2701+ .show_options = ovl_show_options,
2702+};
2703+
2704+enum {
2705+ Opt_lowerdir,
2706+ Opt_upperdir,
2707+ Opt_err,
2708+};
2709+
2710+static const match_table_t ovl_tokens = {
2711+ {Opt_lowerdir, "lowerdir=%s"},
2712+ {Opt_upperdir, "upperdir=%s"},
2713+ {Opt_err, NULL}
2714+};
2715+
2716+static int ovl_parse_opt(char *opt, struct ovl_config *config)
2717+{
2718+ char *p;
2719+
2720+ config->upperdir = NULL;
2721+ config->lowerdir = NULL;
2722+
2723+ while ((p = strsep(&opt, ",")) != NULL) {
2724+ int token;
2725+ substring_t args[MAX_OPT_ARGS];
2726+
2727+ if (!*p)
2728+ continue;
2729+
2730+ token = match_token(p, ovl_tokens, args);
2731+ switch (token) {
2732+ case Opt_upperdir:
2733+ kfree(config->upperdir);
2734+ config->upperdir = match_strdup(&args[0]);
2735+ if (!config->upperdir)
2736+ return -ENOMEM;
2737+ break;
2738+
2739+ case Opt_lowerdir:
2740+ kfree(config->lowerdir);
2741+ config->lowerdir = match_strdup(&args[0]);
2742+ if (!config->lowerdir)
2743+ return -ENOMEM;
2744+ break;
2745+
2746+ default:
2747+ return -EINVAL;
2748+ }
2749+ }
2750+ return 0;
2751+}
2752+/* Set up @sb from the lowerdir=/upperdir= options in @data; returns 0 or -errno. */
2753+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
2754+{
2755+ struct path lowerpath;
2756+ struct path upperpath;
2757+ struct inode *root_inode;
2758+ struct dentry *root_dentry;
2759+ struct ovl_entry *oe;
2760+ struct ovl_fs *ufs;
2761+ int err;
2762+
2763+ err = -ENOMEM;
2764+ ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
2765+ if (!ufs)
2766+ goto out;
2767+ /* ovl_parse_opt() can fail after duplicating one dir string: free config too */
2768+ err = ovl_parse_opt((char *) data, &ufs->config);
2769+ if (err)
2770+ goto out_free_config;
2771+
2772+ err = -EINVAL;
2773+ if (!ufs->config.upperdir || !ufs->config.lowerdir) {
2774+ printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");
2775+ goto out_free_config;
2776+ }
2777+ err = -ENOMEM; /* the two allocations below must not report -EINVAL */
2778+ oe = ovl_alloc_entry();
2779+ if (oe == NULL)
2780+ goto out_free_config;
2781+
2782+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
2783+ if (!root_inode)
2784+ goto out_free_oe;
2785+
2786+ err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath);
2787+ if (err)
2788+ goto out_put_root;
2789+
2790+ err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath);
2791+ if (err)
2792+ goto out_put_upperpath;
2793+
2794+ err = -ENOTDIR;
2795+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
2796+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode))
2797+ goto out_put_lowerpath;
2798+
2799+ ufs->upper_mnt = clone_private_mount(&upperpath);
2800+ err = PTR_ERR(ufs->upper_mnt);
2801+ if (IS_ERR(ufs->upper_mnt)) {
2802+ printk(KERN_ERR "overlayfs: failed to clone upperpath\n");
2803+ goto out_put_lowerpath;
2804+ }
2805+
2806+ ufs->lower_mnt = clone_private_mount(&lowerpath);
2807+ err = PTR_ERR(ufs->lower_mnt);
2808+ if (IS_ERR(ufs->lower_mnt)) {
2809+ printk(KERN_ERR "overlayfs: failed to clone lowerpath\n");
2810+ goto out_put_upper_mnt;
2811+ }
2812+
2813+ if (!(sb->s_flags & MS_RDONLY)) {
2814+ err = mnt_want_write(ufs->upper_mnt);
2815+ if (err)
2816+ goto out_put_lower_mnt;
2817+ }
2818+
2819+ err = -ENOMEM;
2820+ root_dentry = d_alloc_root(root_inode);
2821+ if (!root_dentry)
2822+ goto out_drop_write;
2823+
2824+ mntput(upperpath.mnt);
2825+ mntput(lowerpath.mnt);
2826+
2827+ oe->__upperdentry = dget(upperpath.dentry); /* upper dentry is held twice */
2828+ oe->lowerdentry = lowerpath.dentry;
2829+
2830+ root_dentry->d_fsdata = oe;
2831+ root_dentry->d_op = &ovl_dentry_operations;
2832+
2833+ sb->s_op = &ovl_super_operations;
2834+ sb->s_root = root_dentry;
2835+ sb->s_fs_info = ufs;
2836+
2837+ return 0;
2838+
2839+out_drop_write:
2840+ if (!(sb->s_flags & MS_RDONLY))
2841+ mnt_drop_write(ufs->upper_mnt);
2842+out_put_lower_mnt:
2843+ mntput(ufs->lower_mnt);
2844+out_put_upper_mnt:
2845+ mntput(ufs->upper_mnt);
2846+out_put_lowerpath:
2847+ path_put(&lowerpath);
2848+out_put_upperpath:
2849+ path_put(&upperpath);
2850+out_put_root:
2851+ iput(root_inode);
2852+out_free_oe:
2853+ kfree(oe);
2854+out_free_config:
2855+ kfree(ufs->config.lowerdir);
2856+ kfree(ufs->config.upperdir);
2857+ /* ufs itself is released on every failure after its allocation */
2858+ kfree(ufs);
2859+out:
2860+ return err;
2861+}
2862+
2863+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
2864+ const char *dev_name, void *raw_data)
2865+{
2866+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
2867+}
2868+
2869+static struct file_system_type ovl_fs_type = {
2870+ .owner = THIS_MODULE,
2871+ .name = "overlayfs",
2872+ .mount = ovl_mount,
2873+ .kill_sb = kill_anon_super,
2874+};
2875+
2876+static int __init ovl_init(void)
2877+{
2878+ return register_filesystem(&ovl_fs_type);
2879+}
2880+
2881+static void __exit ovl_exit(void)
2882+{
2883+ unregister_filesystem(&ovl_fs_type);
2884+}
2885+
2886+module_init(ovl_init);
2887+module_exit(ovl_exit);
2888--- /dev/null
2889+++ b/Documentation/filesystems/overlayfs.txt
2890@@ -0,0 +1,167 @@
2891+Written by: Neil Brown <neilb@suse.de>
2892+
2893+Overlay Filesystem
2894+==================
2895+
2896+This document describes a prototype for a new approach to providing
2897+overlay-filesystem functionality in Linux (sometimes referred to as
2898+union-filesystems). An overlay-filesystem tries to present a
2899+filesystem which is the result of overlaying one filesystem on top
2900+of the other.
2901+
2902+The result will inevitably fail to look exactly like a normal
2903+filesystem for various technical reasons. The expectation is that
2904+many use cases will be able to ignore these differences.
2905+
2906+This approach is 'hybrid' because the objects that appear in the
2907+filesystem do not all appear to belong to that filesystem. In many
2908+cases accessing an object in the union will be indistinguishable
2909+from accessing the corresponding object from the original filesystem.
2910+This is most obvious from the 'st_dev' field returned by stat(2).
2911+
2912+While directories will report an st_dev from the overlay-filesystem,
2913+all non-directory objects will report an st_dev from the lower or
2914+upper filesystem that is providing the object. Similarly st_ino will
2915+only be unique when combined with st_dev, and both of these can change
2916+over the lifetime of a non-directory object. Many applications and
2917+tools ignore these values and will not be affected.
2918+
2919+Upper and Lower
2920+---------------
2921+
2922+An overlay filesystem combines two filesystems - an 'upper' filesystem
2923+and a 'lower' filesystem. When a name exists in both filesystems, the
2924+object in the 'upper' filesystem is visible while the object in the
2925+'lower' filesystem is either hidden or, in the case of directories,
2926+merged with the 'upper' object.
2927+
2928+It would be more correct to refer to an upper and lower 'directory
2929+tree' rather than 'filesystem' as it is quite possible for both
2930+directory trees to be in the same filesystem and there is no
2931+requirement that the root of a filesystem be given for either upper or
2932+lower.
2933+
2934+The lower filesystem can be any filesystem supported by Linux and does
2935+not need to be writable. The lower filesystem can even be another
2936+overlayfs. The upper filesystem will normally be writable and if it
2937+is it must support the creation of trusted.* extended attributes, and
2938+must provide valid d_type in readdir responses, at least for symbolic
2939+links - so NFS is not suitable.
2940+
2941+A read-only overlay of two read-only filesystems may use any
2942+filesystem type.
2943+
2944+Directories
2945+-----------
2946+
2947+Overlaying mainly involves directories. If a given name appears in both
2948+upper and lower filesystems and refers to a non-directory in either,
2949+then the lower object is hidden - the name refers only to the upper
2950+object.
2951+
2952+Where both upper and lower objects are directories, a merged directory
2953+is formed.
2954+
2955+At mount time, the two directories given as mount options are combined
2956+into a merged directory:
2957+
2958+ mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay
2959+
2960+Then whenever a lookup is requested in such a merged directory, the
2961+lookup is performed in each actual directory and the combined result
2962+is cached in the dentry belonging to the overlay filesystem. If both
2963+actual lookups find directories, both are stored and a merged
2964+directory is created, otherwise only one is stored: the upper if it
2965+exists, else the lower.
2966+
2967+Only the lists of names from directories are merged. Other content
2968+such as metadata and extended attributes are reported for the upper
2969+directory only. These attributes of the lower directory are hidden.
2970+
2971+whiteouts and opaque directories
2972+--------------------------------
2973+
2974+In order to support rm and rmdir without changing the lower
2975+filesystem, an overlay filesystem needs to record in the upper filesystem
2976+that files have been removed. This is done using whiteouts and opaque
2977+directories (non-directories are always opaque).
2978+
2979+The overlay filesystem uses extended attributes with a
2980+"trusted.overlay." prefix to record these details.
2981+
2982+A whiteout is created as a symbolic link with target
2983+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y".
2984+When a whiteout is found in the upper level of a merged directory, any
2985+matching name in the lower level is ignored, and the whiteout itself
2986+is also hidden.
2987+
2988+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
2989+to "y". Where the upper filesystem contains an opaque directory, any
2990+directory in the lower filesystem with the same name is ignored.
2991+
2992+readdir
2993+-------
2994+
2995+When a 'readdir' request is made on a merged directory, the upper and
2996+lower directories are each read and the name lists merged in the
2997+obvious way (upper is read first, then lower - entries that already
2998+exist are not re-added). This merged name list is cached in the
2999+'struct file' and so remains as long as the file is kept open. If the
3000+directory is opened and read by two processes at the same time, they
3001+will each have separate caches. A seekdir to the start of the
3002+directory (offset 0) followed by a readdir will cause the cache to be
3003+discarded and rebuilt.
3004+
3005+This means that changes to the merged directory do not appear while a
3006+directory is being read. This is unlikely to be noticed by many
3007+programs.
3008+
3009+Seek offsets are assigned sequentially when the directories are read.
3010+Thus if a program were to:
3011+ - read part of a directory
3012+ - remember an offset, and close the directory
3013+ - re-open the directory some time later
3014+ - seek to the remembered offset
3015+
3016+there may be little correlation between the old and new locations in
3017+the list of filenames, particularly if anything has changed in the
3018+directory.
3019+
3020+Readdir on directories that are not merged is simply handled by the
3021+underlying directory (upper or lower).
3022+
3023+
3024+Non-directories
3025+---------------
3026+
3027+Objects that are not directories (files, symlinks, device-special
3028+files etc.) are presented either from the upper or lower filesystem as
3029+appropriate. When a file in the lower filesystem is accessed in a way
3030+that requires write-access, such as opening for write access, changing
3031+some metadata etc., the file is first copied from the lower filesystem
3032+to the upper filesystem (copy_up). Note that creating a hard-link
3033+also requires copy_up, though of course creation of a symlink does
3034+not.
3035+
3036+The copy_up process first makes sure that the containing directory
3037+exists in the upper filesystem - creating it and any parents as
3038+necessary. It then creates the object with the same metadata (owner,
3039+mode, mtime, symlink-target etc.) and then if the object is a file, the
3040+data is copied from the lower to the upper filesystem. Finally any
3041+extended attributes are copied up.
3042+
3043+Once the copy_up is complete, the overlay filesystem simply
3044+provides direct access to the newly created file in the upper
3045+filesystem - future operations on the file are barely noticed by the
3046+overlay filesystem (though an operation on the name of the file such as
3047+rename or unlink will of course be noticed and handled).
3048+
3049+Changes to underlying filesystems
3050+---------------------------------
3051+
3052+Offline changes, when the overlay is not mounted, are allowed to either
3053+the upper or the lower trees.
3054+
3055+Changes to the underlying filesystems while part of a mounted overlay
3056+filesystem are not allowed. This is not yet enforced, but will be in
3057+the future.
3058--- a/MAINTAINERS
3059+++ b/MAINTAINERS
3060@@ -4727,6 +4727,13 @@ F: drivers/scsi/osd/
3061 F: include/scsi/osd_*
3062 F: fs/exofs/
3063 
3064+OVERLAYFS FILESYSTEM
3065+M: Miklos Szeredi <miklos@szeredi.hu>
3066+L: linux-fsdevel@vger.kernel.org
3067+S: Supported
3068+F: fs/overlayfs/*
3069+F: Documentation/filesystems/overlayfs.txt
3070+
3071 P54 WIRELESS DRIVER
3072 M: Christian Lamparter <chunkeey@googlemail.com>
3073 L: linux-wireless@vger.kernel.org
3074

Archive Download this file



interactive