Root/target/linux/generic/patches-3.1/100-overlayfs_v11.patch

1--- /dev/null
2+++ b/Documentation/filesystems/overlayfs.txt
3@@ -0,0 +1,199 @@
4+Written by: Neil Brown <neilb@suse.de>
5+
6+Overlay Filesystem
7+==================
8+
9+This document describes a prototype for a new approach to providing
10+overlay-filesystem functionality in Linux (sometimes referred to as
11+union-filesystems). An overlay-filesystem tries to present a
12+filesystem which is the result of overlaying one filesystem on top
13+of the other.
14+
15+The result will inevitably fail to look exactly like a normal
16+filesystem for various technical reasons. The expectation is that
17+many use cases will be able to ignore these differences.
18+
19+This approach is 'hybrid' because the objects that appear in the
20+filesystem do not all appear to belong to that filesystem. In many
21+cases an object accessed in the union will be indistinguishable
22+from accessing the corresponding object from the original filesystem.
23+This is most obvious from the 'st_dev' field returned by stat(2).
24+
25+While directories will report an st_dev from the overlay-filesystem,
26+all non-directory objects will report an st_dev from the lower or
27+upper filesystem that is providing the object. Similarly st_ino will
28+only be unique when combined with st_dev, and both of these can change
29+over the lifetime of a non-directory object. Many applications and
30+tools ignore these values and will not be affected.
31+
32+Upper and Lower
33+---------------
34+
35+An overlay filesystem combines two filesystems - an 'upper' filesystem
36+and a 'lower' filesystem. When a name exists in both filesystems, the
37+object in the 'upper' filesystem is visible while the object in the
38+'lower' filesystem is either hidden or, in the case of directories,
39+merged with the 'upper' object.
40+
41+It would be more correct to refer to an upper and lower 'directory
42+tree' rather than 'filesystem' as it is quite possible for both
43+directory trees to be in the same filesystem and there is no
44+requirement that the root of a filesystem be given for either upper or
45+lower.
46+
47+The lower filesystem can be any filesystem supported by Linux and does
48+not need to be writable. The lower filesystem can even be another
49+overlayfs. The upper filesystem will normally be writable and if it
50+is it must support the creation of trusted.* extended attributes, and
51+must provide valid d_type in readdir responses, at least for symbolic
52+links - so NFS is not suitable.
53+
54+A read-only overlay of two read-only filesystems may use any
55+filesystem type.
56+
57+Directories
58+-----------
59+
60+Overlaying mainly involves directories. If a given name appears in both
61+upper and lower filesystems and refers to a non-directory in either,
62+then the lower object is hidden - the name refers only to the upper
63+object.
64+
65+Where both upper and lower objects are directories, a merged directory
66+is formed.
67+
68+At mount time, the two directories given as mount options are combined
69+into a merged directory:
70+
71+ mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay
72+
73+Then whenever a lookup is requested in such a merged directory, the
74+lookup is performed in each actual directory and the combined result
75+is cached in the dentry belonging to the overlay filesystem. If both
76+actual lookups find directories, both are stored and a merged
77+directory is created, otherwise only one is stored: the upper if it
78+exists, else the lower.
79+
80+Only the lists of names from directories are merged. Other content
81+such as metadata and extended attributes are reported for the upper
82+directory only. These attributes of the lower directory are hidden.
83+
84+whiteouts and opaque directories
85+--------------------------------
86+
87+In order to support rm and rmdir without changing the lower
88+filesystem, an overlay filesystem needs to record in the upper filesystem
89+that files have been removed. This is done using whiteouts and opaque
90+directories (non-directories are always opaque).
91+
92+The overlay filesystem uses extended attributes with a
93+"trusted.overlay." prefix to record these details.
94+
95+A whiteout is created as a symbolic link with target
96+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y".
97+When a whiteout is found in the upper level of a merged directory, any
98+matching name in the lower level is ignored, and the whiteout itself
99+is also hidden.
100+
101+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
102+to "y". Where the upper filesystem contains an opaque directory, any
103+directory in the lower filesystem with the same name is ignored.
104+
105+readdir
106+-------
107+
108+When a 'readdir' request is made on a merged directory, the upper and
109+lower directories are each read and the name lists merged in the
110+obvious way (upper is read first, then lower - entries that already
111+exist are not re-added). This merged name list is cached in the
112+'struct file' and so remains as long as the file is kept open. If the
113+directory is opened and read by two processes at the same time, they
114+will each have separate caches. A seekdir to the start of the
115+directory (offset 0) followed by a readdir will cause the cache to be
116+discarded and rebuilt.
117+
118+This means that changes to the merged directory do not appear while a
119+directory is being read. This is unlikely to be noticed by many
120+programs.
121+
122+seek offsets are assigned sequentially when the directories are read.
123+Thus if
124+ - read part of a directory
125+ - remember an offset, and close the directory
126+ - re-open the directory some time later
127+ - seek to the remembered offset
128+
129+there may be little correlation between the old and new locations in
130+the list of filenames, particularly if anything has changed in the
131+directory.
132+
133+Readdir on directories that are not merged is simply handled by the
134+underlying directory (upper or lower).
135+
136+
137+Non-directories
138+---------------
139+
140+Objects that are not directories (files, symlinks, device-special
141+files etc.) are presented either from the upper or lower filesystem as
142+appropriate. When a file in the lower filesystem is accessed in a way
143+that requires write-access, such as opening for write access, changing
144+some metadata etc., the file is first copied from the lower filesystem
145+to the upper filesystem (copy_up). Note that creating a hard-link
146+also requires copy_up, though of course creation of a symlink does
147+not.
148+
149+The copy_up may turn out to be unnecessary, for example if the file is
150+opened for read-write but the data is not modified.
151+
152+The copy_up process first makes sure that the containing directory
153+exists in the upper filesystem - creating it and any parents as
154+necessary. It then creates the object with the same metadata (owner,
155+mode, mtime, symlink-target etc.) and then if the object is a file, the
156+data is copied from the lower to the upper filesystem. Finally any
157+extended attributes are copied up.
158+
159+Once the copy_up is complete, the overlay filesystem simply
160+provides direct access to the newly created file in the upper
161+filesystem - future operations on the file are barely noticed by the
162+overlay filesystem (though an operation on the name of the file such as
163+rename or unlink will of course be noticed and handled).
164+
165+
166+Non-standard behavior
167+---------------------
168+
169+The copy_up operation essentially creates a new, identical file and
170+moves it over to the old name. The new file may be on a different
171+filesystem, so both st_dev and st_ino of the file may change.
172+
173+Any open files referring to this inode will access the old data and
174+metadata. Similarly any file locks obtained before copy_up will not
175+apply to the copied up file.
176+
177+On a file opened with O_RDONLY, fchmod(2), fchown(2), futimesat(2)
178+and fsetxattr(2) will fail with EROFS.
179+
180+If a file with multiple hard links is copied up, then this will
181+"break" the link. Changes will not be propagated to other names
182+referring to the same inode.
183+
184+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
185+object in overlayfs will not contain valid absolute paths, only
186+relative paths leading up to the filesystem's root. This will be
187+fixed in the future.
188+
189+Some operations are not atomic, for example a crash during copy_up or
190+rename will leave the filesystem in an inconsistent state. This will
191+be addressed in the future.
192+
193+Changes to underlying filesystems
194+---------------------------------
195+
196+Offline changes, when the overlay is not mounted, are allowed to either
197+the upper or the lower trees.
198+
199+Changes to the underlying filesystems while part of a mounted overlay
200+filesystem are not allowed. If the underlying filesystem is changed,
201+the behavior of the overlay is undefined, though it will not result in
202+a crash or deadlock.
203--- a/MAINTAINERS
204+++ b/MAINTAINERS
205@@ -4783,6 +4783,13 @@ F: drivers/scsi/osd/
206 F: include/scsi/osd_*
207 F: fs/exofs/
208 
209+OVERLAYFS FILESYSTEM
210+M: Miklos Szeredi <miklos@szeredi.hu>
211+L: linux-fsdevel@vger.kernel.org
212+S: Supported
213+F: fs/overlayfs/*
214+F: Documentation/filesystems/overlayfs.txt
215+
216 P54 WIRELESS DRIVER
217 M: Christian Lamparter <chunkeey@googlemail.com>
218 L: linux-wireless@vger.kernel.org
219--- a/fs/Kconfig
220+++ b/fs/Kconfig
221@@ -63,6 +63,7 @@ source "fs/quota/Kconfig"
222 
223 source "fs/autofs4/Kconfig"
224 source "fs/fuse/Kconfig"
225+source "fs/overlayfs/Kconfig"
226 
227 config CUSE
228     tristate "Character device in Userspace support"
229--- a/fs/Makefile
230+++ b/fs/Makefile
231@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4/
232 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
233 obj-$(CONFIG_ADFS_FS) += adfs/
234 obj-$(CONFIG_FUSE_FS) += fuse/
235+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
236 obj-$(CONFIG_UDF_FS) += udf/
237 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
238 obj-$(CONFIG_OMFS_FS) += omfs/
239--- a/fs/ecryptfs/main.c
240+++ b/fs/ecryptfs/main.c
241@@ -544,6 +544,13 @@ static struct dentry *ecryptfs_mount(str
242     s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
243     s->s_blocksize = path.dentry->d_sb->s_blocksize;
244     s->s_magic = ECRYPTFS_SUPER_MAGIC;
245+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
246+
247+ rc = -EINVAL;
248+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
249+ printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n");
250+ goto out_free;
251+ }
252 
253     inode = ecryptfs_get_inode(path.dentry->d_inode, s);
254     rc = PTR_ERR(inode);
255--- a/fs/namespace.c
256+++ b/fs/namespace.c
257@@ -1494,6 +1494,23 @@ void drop_collected_mounts(struct vfsmou
258     release_mounts(&umount_list);
259 }
260 
261+struct vfsmount *clone_private_mount(struct path *path)
262+{
263+ struct vfsmount *mnt;
264+
265+ if (IS_MNT_UNBINDABLE(path->mnt))
266+ return ERR_PTR(-EINVAL);
267+
268+ down_read(&namespace_sem);
269+ mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE);
270+ up_read(&namespace_sem);
271+ if (!mnt)
272+ return ERR_PTR(-ENOMEM);
273+
274+ return mnt;
275+}
276+EXPORT_SYMBOL_GPL(clone_private_mount);
277+
278 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
279            struct vfsmount *root)
280 {
281--- a/fs/open.c
282+++ b/fs/open.c
283@@ -644,8 +644,7 @@ static inline int __get_file_write_acces
284     return error;
285 }
286 
287-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
288- struct file *f,
289+static struct file *__dentry_open(struct path *path, struct file *f,
290                     int (*open)(struct inode *, struct file *),
291                     const struct cred *cred)
292 {
293@@ -653,15 +652,16 @@ static struct file *__dentry_open(struct
294     struct inode *inode;
295     int error;
296 
297+ path_get(path);
298     f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
299                 FMODE_PREAD | FMODE_PWRITE;
300 
301     if (unlikely(f->f_flags & O_PATH))
302         f->f_mode = FMODE_PATH;
303 
304- inode = dentry->d_inode;
305+ inode = path->dentry->d_inode;
306     if (f->f_mode & FMODE_WRITE) {
307- error = __get_file_write_access(inode, mnt);
308+ error = __get_file_write_access(inode, path->mnt);
309         if (error)
310             goto cleanup_file;
311         if (!special_file(inode->i_mode))
312@@ -669,8 +669,7 @@ static struct file *__dentry_open(struct
313     }
314 
315     f->f_mapping = inode->i_mapping;
316- f->f_path.dentry = dentry;
317- f->f_path.mnt = mnt;
318+ f->f_path = *path;
319     f->f_pos = 0;
320     file_sb_list_add(f, inode->i_sb);
321 
322@@ -723,7 +722,7 @@ cleanup_all:
323              * here, so just reset the state.
324              */
325             file_reset_write(f);
326- mnt_drop_write(mnt);
327+ mnt_drop_write(path->mnt);
328         }
329     }
330     file_sb_list_del(f);
331@@ -731,8 +730,7 @@ cleanup_all:
332     f->f_path.mnt = NULL;
333 cleanup_file:
334     put_filp(f);
335- dput(dentry);
336- mntput(mnt);
337+ path_put(path);
338     return ERR_PTR(error);
339 }
340 
341@@ -758,14 +756,14 @@ cleanup_file:
342 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
343         int (*open)(struct inode *, struct file *))
344 {
345+ struct path path = { .dentry = dentry, .mnt = nd->path.mnt };
346     const struct cred *cred = current_cred();
347 
348     if (IS_ERR(nd->intent.open.file))
349         goto out;
350     if (IS_ERR(dentry))
351         goto out_err;
352- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
353- nd->intent.open.file,
354+ nd->intent.open.file = __dentry_open(&path, nd->intent.open.file,
355                          open, cred);
356 out:
357     return nd->intent.open.file;
358@@ -794,10 +792,17 @@ struct file *nameidata_to_filp(struct na
359 
360     /* Has the filesystem initialised the file for us? */
361     if (filp->f_path.dentry == NULL) {
362- path_get(&nd->path);
363- filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
364- NULL, cred);
365+ struct inode *inode = nd->path.dentry->d_inode;
366+
367+ if (inode->i_op->open) {
368+ int flags = filp->f_flags;
369+ put_filp(filp);
370+ filp = inode->i_op->open(nd->path.dentry, flags, cred);
371+ } else {
372+ filp = __dentry_open(&nd->path, filp, NULL, cred);
373+ }
374     }
375+
376     return filp;
377 }
378 
379@@ -808,26 +813,45 @@ struct file *nameidata_to_filp(struct na
380 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
381              const struct cred *cred)
382 {
383- int error;
384- struct file *f;
385-
386- validate_creds(cred);
387+ struct path path = { .dentry = dentry, .mnt = mnt };
388+ struct file *ret;
389 
390     /* We must always pass in a valid mount pointer. */
391     BUG_ON(!mnt);
392 
393- error = -ENFILE;
394+ ret = vfs_open(&path, flags, cred);
395+ path_put(&path);
396+
397+ return ret;
398+}
399+EXPORT_SYMBOL(dentry_open);
400+
401+/**
402+ * vfs_open - open the file at the given path
403+ * @path: path to open
404+ * @flags: open flags
405+ * @cred: credentials to use
406+ *
407+ * Open the file. If successful, the returned file will have acquired
408+ * an additional reference for path.
409+ */
410+struct file *vfs_open(struct path *path, int flags, const struct cred *cred)
411+{
412+ struct file *f;
413+ struct inode *inode = path->dentry->d_inode;
414+
415+ validate_creds(cred);
416+
417+ if (inode->i_op->open)
418+ return inode->i_op->open(path->dentry, flags, cred);
419     f = get_empty_filp();
420- if (f == NULL) {
421- dput(dentry);
422- mntput(mnt);
423- return ERR_PTR(error);
424- }
425+ if (f == NULL)
426+ return ERR_PTR(-ENFILE);
427 
428     f->f_flags = flags;
429- return __dentry_open(dentry, mnt, f, NULL, cred);
430+ return __dentry_open(path, f, NULL, cred);
431 }
432-EXPORT_SYMBOL(dentry_open);
433+EXPORT_SYMBOL(vfs_open);
434 
435 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
436 {
437--- /dev/null
438+++ b/fs/overlayfs/Kconfig
439@@ -0,0 +1,4 @@
440+config OVERLAYFS_FS
441+ tristate "Overlay filesystem support"
442+ help
443+ Add support for overlay filesystem.
444--- /dev/null
445+++ b/fs/overlayfs/Makefile
446@@ -0,0 +1,7 @@
447+#
448+# Makefile for the overlay filesystem.
449+#
450+
451+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
452+
453+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
454--- /dev/null
455+++ b/fs/overlayfs/copy_up.c
456@@ -0,0 +1,383 @@
457+/*
458+ *
459+ * Copyright (C) 2011 Novell Inc.
460+ *
461+ * This program is free software; you can redistribute it and/or modify it
462+ * under the terms of the GNU General Public License version 2 as published by
463+ * the Free Software Foundation.
464+ */
465+
466+#include <linux/fs.h>
467+#include <linux/slab.h>
468+#include <linux/file.h>
469+#include <linux/splice.h>
470+#include <linux/xattr.h>
471+#include <linux/security.h>
472+#include <linux/uaccess.h>
473+#include "overlayfs.h"
474+
475+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
476+
477+static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)
478+{
479+ ssize_t list_size, size;
480+ char *buf, *name, *value;
481+ int error;
482+ /* Copy every xattr listed on @old onto @new; returns 0 or a negative errno. */
483+ if (!old->d_inode->i_op->getxattr ||
484+ !new->d_inode->i_op->getxattr)
485+ return 0;
486+ /* First call with a NULL buffer only asks how large the name list is. */
487+ list_size = vfs_listxattr(old, NULL, 0);
488+ if (list_size <= 0) {
489+ if (list_size == -EOPNOTSUPP)
490+ return 0;
491+ return list_size;
492+ }
493+
494+ buf = kzalloc(list_size, GFP_KERNEL);
495+ if (!buf)
496+ return -ENOMEM;
497+
498+ error = -ENOMEM;
499+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
500+ if (!value)
501+ goto out;
502+
503+ list_size = vfs_listxattr(old, buf, list_size);
504+ if (list_size <= 0) {
505+ error = list_size;
506+ goto out_free_value;
507+ }
508+ /* buf holds the names back to back, each NUL-terminated; walk them in order. */
509+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
510+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
511+ if (size < 0) { /* 0 is an empty value: still copy it and keep going */
512+ error = size;
513+ goto out_free_value;
514+ }
515+ error = vfs_setxattr(new, name, value, size, 0);
516+ if (error)
517+ goto out_free_value;
518+ }
519+
520+out_free_value:
521+ kfree(value);
522+out:
523+ kfree(buf);
524+ return error;
525+}
526+
527+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
528+{
529+ struct file *old_file;
530+ struct file *new_file;
531+ int error = 0;
532+
533+ if (len == 0)
534+ return 0;
535+
536+ old_file = vfs_open(old, O_RDONLY, current_cred());
537+ if (IS_ERR(old_file))
538+ return PTR_ERR(old_file);
539+
540+ new_file = vfs_open(new, O_WRONLY, current_cred());
541+ if (IS_ERR(new_file)) {
542+ error = PTR_ERR(new_file);
543+ goto out_fput;
544+ }
545+
546+ /* FIXME: copy up sparse files efficiently */
547+ while (len) {
548+ loff_t offset = new_file->f_pos;
549+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
550+ long bytes;
551+
552+ if (len < this_len)
553+ this_len = len;
554+
555+ if (signal_pending_state(TASK_KILLABLE, current)) {
556+ error = -EINTR;
557+ break;
558+ }
559+
560+ bytes = do_splice_direct(old_file, &offset, new_file, this_len,
561+ SPLICE_F_MOVE);
562+ if (bytes <= 0) {
563+ error = bytes;
564+ break;
565+ }
566+
567+ len -= bytes;
568+ }
569+
570+ fput(new_file);
571+out_fput:
572+ fput(old_file);
573+ return error;
574+}
575+
576+static char *ovl_read_symlink(struct dentry *realdentry)
577+{
578+ int res;
579+ char *buf;
580+ struct inode *inode = realdentry->d_inode;
581+ mm_segment_t old_fs;
582+
583+ res = -EINVAL;
584+ if (!inode->i_op->readlink)
585+ goto err;
586+
587+ res = -ENOMEM;
588+ buf = (char *) __get_free_page(GFP_KERNEL);
589+ if (!buf)
590+ goto err;
591+
592+ old_fs = get_fs();
593+ set_fs(get_ds());
594+ /* The cast to a user pointer is valid due to the set_fs() */
595+ res = inode->i_op->readlink(realdentry,
596+ (char __user *)buf, PAGE_SIZE - 1);
597+ set_fs(old_fs);
598+ if (res < 0) {
599+ free_page((unsigned long) buf);
600+ goto err;
601+ }
602+ buf[res] = '\0';
603+
604+ return buf;
605+
606+err:
607+ return ERR_PTR(res);
608+}
609+
610+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
611+{
612+ struct iattr attr = {
613+ .ia_valid = ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
614+ .ia_atime = stat->atime,
615+ .ia_mtime = stat->mtime,
616+ };
617+
618+ return notify_change(upperdentry, &attr);
619+}
620+
621+static int ovl_set_mode(struct dentry *upperdentry, umode_t mode)
622+{
623+ struct iattr attr = {
624+ .ia_valid = ATTR_MODE,
625+ .ia_mode = mode,
626+ };
627+
628+ return notify_change(upperdentry, &attr);
629+}
630+
631+static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,
632+ struct path *lowerpath, struct kstat *stat,
633+ const char *link)
634+{
635+ int err;
636+ struct path newpath;
637+ umode_t mode = stat->mode;
638+
639+ /* Can't properly set mode on creation because of the umask */
640+ stat->mode &= S_IFMT;
641+
642+ ovl_path_upper(dentry, &newpath);
643+ WARN_ON(newpath.dentry);
644+ newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link);
645+ if (IS_ERR(newpath.dentry))
646+ return PTR_ERR(newpath.dentry);
647+
648+ if (S_ISREG(stat->mode)) {
649+ err = ovl_copy_up_data(lowerpath, &newpath, stat->size);
650+ if (err)
651+ goto err_remove;
652+ }
653+
654+ err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry);
655+ if (err)
656+ goto err_remove;
657+
658+ mutex_lock(&newpath.dentry->d_inode->i_mutex);
659+ if (!S_ISLNK(stat->mode))
660+ err = ovl_set_mode(newpath.dentry, mode);
661+ if (!err)
662+ err = ovl_set_timestamps(newpath.dentry, stat);
663+ mutex_unlock(&newpath.dentry->d_inode->i_mutex);
664+ if (err)
665+ goto err_remove;
666+
667+ ovl_dentry_update(dentry, newpath.dentry);
668+
669+ /*
670+ * Easiest way to get rid of the lower dentry reference is to
671+ * drop this dentry. This is neither needed nor possible for
672+ * directories.
673+ */
674+ if (!S_ISDIR(stat->mode))
675+ d_drop(dentry);
676+
677+ return 0;
678+
679+err_remove:
680+ if (S_ISDIR(stat->mode))
681+ vfs_rmdir(upperdir->d_inode, newpath.dentry);
682+ else
683+ vfs_unlink(upperdir->d_inode, newpath.dentry);
684+
685+ dput(newpath.dentry);
686+
687+ return err;
688+}
689+
690+/*
691+ * Copy up a single dentry
692+ *
693+ * Directory renames only allowed on "pure upper" (already created on
694+ * upper filesystem, never copied up). Directories which are on lower or
695+ * are merged may not be renamed. For these -EXDEV is returned and
696+ * userspace has to deal with it. This means, when copying up a
697+ * directory we can rely on it and ancestors being stable.
698+ *
699+ * Non-directory renames start with copy up of source if necessary. The
700+ * actual rename will only proceed once the copy up was successful. Copy
701+ * up uses upper parent i_mutex for exclusion. Since rename can change
702+ * d_parent it is possible that the copy up will lock the old parent. At
703+ * that point the file will have already been copied up anyway.
704+ */
705+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
706+ struct path *lowerpath, struct kstat *stat)
707+{
708+ int err;
709+ struct kstat pstat;
710+ struct path parentpath;
711+ struct dentry *upperdir;
712+ const struct cred *old_cred;
713+ struct cred *override_cred;
714+ char *link = NULL;
715+
716+ ovl_path_upper(parent, &parentpath);
717+ upperdir = parentpath.dentry;
718+
719+ err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat);
720+ if (err)
721+ return err;
722+
723+ if (S_ISLNK(stat->mode)) {
724+ link = ovl_read_symlink(lowerpath->dentry);
725+ if (IS_ERR(link))
726+ return PTR_ERR(link);
727+ }
728+
729+ err = -ENOMEM;
730+ override_cred = prepare_creds();
731+ if (!override_cred)
732+ goto out_free_link;
733+
734+ override_cred->fsuid = stat->uid;
735+ override_cred->fsgid = stat->gid;
736+ /*
737+ * CAP_SYS_ADMIN for copying up extended attributes
738+ * CAP_DAC_OVERRIDE for create
739+ * CAP_FOWNER for chmod, timestamp update
740+ * CAP_FSETID for chmod
741+ * CAP_MKNOD for mknod
742+ */
743+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
744+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
745+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
746+ cap_raise(override_cred->cap_effective, CAP_FSETID);
747+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
748+ old_cred = override_creds(override_cred);
749+
750+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
751+ if (ovl_path_type(dentry) != OVL_PATH_LOWER) {
752+ err = 0;
753+ } else {
754+ err = ovl_copy_up_locked(upperdir, dentry, lowerpath,
755+ stat, link);
756+ if (!err) {
757+ /* Restore timestamps on parent (best effort) */
758+ ovl_set_timestamps(upperdir, &pstat);
759+ }
760+ }
761+
762+ mutex_unlock(&upperdir->d_inode->i_mutex);
763+
764+ revert_creds(old_cred);
765+ put_cred(override_cred);
766+
767+out_free_link:
768+ if (link)
769+ free_page((unsigned long) link);
770+
771+ return err;
772+}
773+
774+int ovl_copy_up(struct dentry *dentry)
775+{
776+ int err;
777+
778+ err = 0;
779+ while (!err) {
780+ struct dentry *next;
781+ struct dentry *parent;
782+ struct path lowerpath;
783+ struct kstat stat;
784+ enum ovl_path_type type = ovl_path_type(dentry);
785+
786+ if (type != OVL_PATH_LOWER)
787+ break;
788+
789+ next = dget(dentry);
790+ /* find the topmost dentry not yet copied up */
791+ for (;;) {
792+ parent = dget_parent(next);
793+
794+ type = ovl_path_type(parent);
795+ if (type != OVL_PATH_LOWER)
796+ break;
797+
798+ dput(next);
799+ next = parent;
800+ }
801+
802+ ovl_path_lower(next, &lowerpath);
803+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
804+ if (!err)
805+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
806+
807+ dput(parent);
808+ dput(next);
809+ }
810+
811+ return err;
812+}
813+
814+/* Optimize by not copying up the file first and truncating later */
815+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)
816+{
817+ int err;
818+ struct kstat stat;
819+ struct path lowerpath;
820+ struct dentry *parent = dget_parent(dentry);
821+
822+ err = ovl_copy_up(parent);
823+ if (err)
824+ goto out_dput_parent;
825+
826+ ovl_path_lower(dentry, &lowerpath);
827+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
828+ if (err)
829+ goto out_dput_parent;
830+
831+ if (size < stat.size)
832+ stat.size = size;
833+
834+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
835+
836+out_dput_parent:
837+ dput(parent);
838+ return err;
839+}
840--- /dev/null
841+++ b/fs/overlayfs/dir.c
842@@ -0,0 +1,596 @@
843+/*
844+ *
845+ * Copyright (C) 2011 Novell Inc.
846+ *
847+ * This program is free software; you can redistribute it and/or modify it
848+ * under the terms of the GNU General Public License version 2 as published by
849+ * the Free Software Foundation.
850+ */
851+
852+#include <linux/fs.h>
853+#include <linux/namei.h>
854+#include <linux/xattr.h>
855+#include <linux/security.h>
856+#include "overlayfs.h"
857+
858+static const char *ovl_whiteout_symlink = "(overlay-whiteout)";
859+
860+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)
861+{
862+ int err;
863+ struct dentry *newdentry;
864+ const struct cred *old_cred;
865+ struct cred *override_cred;
866+
867+ /* FIXME: recheck lower dentry to see if whiteout is really needed */
868+
869+ err = -ENOMEM;
870+ override_cred = prepare_creds();
871+ if (!override_cred)
872+ goto out;
873+
874+ /*
875+ * CAP_SYS_ADMIN for setxattr
876+ * CAP_DAC_OVERRIDE for symlink creation
877+ * CAP_FOWNER for unlink in sticky directory
878+ */
879+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
880+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
881+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
882+ override_cred->fsuid = 0;
883+ override_cred->fsgid = 0;
884+ old_cred = override_creds(override_cred);
885+
886+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
887+ dentry->d_name.len);
888+ err = PTR_ERR(newdentry);
889+ if (IS_ERR(newdentry))
890+ goto out_put_cred;
891+
892+ /* Just been removed within the same locked region */
893+ WARN_ON(newdentry->d_inode);
894+
895+ err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink);
896+ if (err)
897+ goto out_dput;
898+
899+ ovl_dentry_version_inc(dentry->d_parent);
900+
901+ err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0);
902+ if (err)
903+ vfs_unlink(upperdir->d_inode, newdentry);
904+
905+out_dput:
906+ dput(newdentry);
907+out_put_cred:
908+ revert_creds(old_cred);
909+ put_cred(override_cred);
910+out:
911+ if (err) {
912+ /*
913+ * There's no way to recover from failure to whiteout.
914+ * What should we do? Log a big fat error and... ?
915+ */
916+ printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n",
917+ dentry->d_name.name);
918+ }
919+
920+ return err;
921+}
922+
923+static struct dentry *ovl_lookup_create(struct dentry *upperdir,
924+ struct dentry *template)
925+{
926+ int err;
927+ struct dentry *newdentry;
928+ struct qstr *name = &template->d_name;
929+
930+ newdentry = lookup_one_len(name->name, upperdir, name->len);
931+ if (IS_ERR(newdentry))
932+ return newdentry;
933+
934+ if (newdentry->d_inode) {
935+ const struct cred *old_cred;
936+ struct cred *override_cred;
937+
938+ /* No need to check whiteout if lower parent is non-existent */
939+ err = -EEXIST;
940+ if (!ovl_dentry_lower(template->d_parent))
941+ goto out_dput;
942+
943+ if (!S_ISLNK(newdentry->d_inode->i_mode))
944+ goto out_dput;
945+
946+ err = -ENOMEM;
947+ override_cred = prepare_creds();
948+ if (!override_cred)
949+ goto out_dput;
950+
951+ /*
952+ * CAP_SYS_ADMIN for getxattr
953+ * CAP_FOWNER for unlink in sticky directory
954+ */
955+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
956+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
957+ old_cred = override_creds(override_cred);
958+
959+ err = -EEXIST;
960+ if (ovl_is_whiteout(newdentry))
961+ err = vfs_unlink(upperdir->d_inode, newdentry);
962+
963+ revert_creds(old_cred);
964+ put_cred(override_cred);
965+ if (err)
966+ goto out_dput;
967+
968+ dput(newdentry);
969+ newdentry = lookup_one_len(name->name, upperdir, name->len);
970+ if (IS_ERR(newdentry)) {
971+ ovl_whiteout(upperdir, template);
972+ return newdentry;
973+ }
974+
975+ /*
976+ * Whiteout just been successfully removed, parent
977+ * i_mutex is still held, there's no way the lookup
978+ * could return positive.
979+ */
980+ WARN_ON(newdentry->d_inode);
981+ }
982+
983+ return newdentry;
984+
985+out_dput:
986+ dput(newdentry);
987+ return ERR_PTR(err);
988+}
989+
990+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
991+ struct kstat *stat, const char *link)
992+{
993+ int err;
994+ struct dentry *newdentry;
995+ struct inode *dir = upperdir->d_inode;
996+
997+ newdentry = ovl_lookup_create(upperdir, dentry);
998+ if (IS_ERR(newdentry))
999+ goto out;
1000+
1001+ switch (stat->mode & S_IFMT) {
1002+ case S_IFREG:
1003+ err = vfs_create(dir, newdentry, stat->mode, NULL);
1004+ break;
1005+
1006+ case S_IFDIR:
1007+ err = vfs_mkdir(dir, newdentry, stat->mode);
1008+ break;
1009+
1010+ case S_IFCHR:
1011+ case S_IFBLK:
1012+ case S_IFIFO:
1013+ case S_IFSOCK:
1014+ err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev);
1015+ break;
1016+
1017+ case S_IFLNK:
1018+ err = vfs_symlink(dir, newdentry, link);
1019+ break;
1020+
1021+ default:
1022+ err = -EPERM;
1023+ }
1024+ if (err) {
1025+ if (ovl_dentry_is_opaque(dentry))
1026+ ovl_whiteout(upperdir, dentry);
1027+ dput(newdentry);
1028+ newdentry = ERR_PTR(err);
1029+ } else if (WARN_ON(!newdentry->d_inode)) {
1030+ /*
1031+ * Not quite sure if non-instantiated dentry is legal or not.
1032+ * VFS doesn't seem to care so check and warn here.
1033+ */
1034+ dput(newdentry);
1035+ newdentry = ERR_PTR(-ENOENT);
1036+ }
1037+
1038+out:
1039+ return newdentry;
1040+
1041+}
1042+
1043+static int ovl_set_opaque(struct dentry *upperdentry)
1044+{
1045+ int err;
1046+ const struct cred *old_cred;
1047+ struct cred *override_cred;
1048+
1049+ override_cred = prepare_creds();
1050+ if (!override_cred)
1051+ return -ENOMEM;
1052+
1053+ /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */
1054+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1055+ old_cred = override_creds(override_cred);
1056+ err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
1057+ revert_creds(old_cred);
1058+ put_cred(override_cred);
1059+
1060+ return err;
1061+}
1062+
1063+static int ovl_remove_opaque(struct dentry *upperdentry)
1064+{
1065+ int err;
1066+ const struct cred *old_cred;
1067+ struct cred *override_cred;
1068+
1069+ override_cred = prepare_creds();
1070+ if (!override_cred)
1071+ return -ENOMEM;
1072+
1073+ /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */
1074+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1075+ old_cred = override_creds(override_cred);
1076+ err = vfs_removexattr(upperdentry, ovl_opaque_xattr);
1077+ revert_creds(old_cred);
1078+ put_cred(override_cred);
1079+
1080+ return err;
1081+}
1082+
1083+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
1084+ struct kstat *stat)
1085+{
1086+ int err;
1087+ enum ovl_path_type type;
1088+ struct path realpath;
1089+
1090+ type = ovl_path_real(dentry, &realpath);
1091+ err = vfs_getattr(realpath.mnt, realpath.dentry, stat);
1092+ if (err)
1093+ return err;
1094+
1095+ stat->dev = dentry->d_sb->s_dev;
1096+ stat->ino = dentry->d_inode->i_ino;
1097+
1098+ /*
1099+ * It's probably not worth it to count subdirs to get the
1100+ * correct link count. nlink=1 seems to pacify 'find' and
1101+ * other utilities.
1102+ */
1103+ if (type == OVL_PATH_MERGE)
1104+ stat->nlink = 1;
1105+
1106+ return 0;
1107+}
1108+
1109+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
1110+ const char *link)
1111+{
1112+ int err;
1113+ struct dentry *newdentry;
1114+ struct dentry *upperdir;
1115+ struct inode *inode;
1116+ struct kstat stat = {
1117+ .mode = mode,
1118+ .rdev = rdev,
1119+ };
1120+
1121+ err = -ENOMEM;
1122+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
1123+ if (!inode)
1124+ goto out;
1125+
1126+ err = ovl_copy_up(dentry->d_parent);
1127+ if (err)
1128+ goto out_iput;
1129+
1130+ upperdir = ovl_dentry_upper(dentry->d_parent);
1131+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1132+
1133+ newdentry = ovl_upper_create(upperdir, dentry, &stat, link);
1134+ err = PTR_ERR(newdentry);
1135+ if (IS_ERR(newdentry))
1136+ goto out_unlock;
1137+
1138+ ovl_dentry_version_inc(dentry->d_parent);
1139+ if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) {
1140+ err = ovl_set_opaque(newdentry);
1141+ if (err) {
1142+ vfs_rmdir(upperdir->d_inode, newdentry);
1143+ ovl_whiteout(upperdir, dentry);
1144+ goto out_dput;
1145+ }
1146+ }
1147+ ovl_dentry_update(dentry, newdentry);
1148+ d_instantiate(dentry, inode);
1149+ inode = NULL;
1150+ newdentry = NULL;
1151+ err = 0;
1152+
1153+out_dput:
1154+ dput(newdentry);
1155+out_unlock:
1156+ mutex_unlock(&upperdir->d_inode->i_mutex);
1157+out_iput:
1158+ iput(inode);
1159+out:
1160+ return err;
1161+}
1162+
1163+static int ovl_create(struct inode *dir, struct dentry *dentry, int mode,
1164+ struct nameidata *nd)
1165+{
1166+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
1167+}
1168+
1169+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1170+{
1171+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
1172+}
1173+
1174+static int ovl_mknod(struct inode *dir, struct dentry *dentry, int mode,
1175+ dev_t rdev)
1176+{
1177+ return ovl_create_object(dentry, mode, rdev, NULL);
1178+}
1179+
1180+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
1181+ const char *link)
1182+{
1183+ return ovl_create_object(dentry, S_IFLNK, 0, link);
1184+}
1185+
1186+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
1187+{
1188+ int err;
1189+ enum ovl_path_type type;
1190+ struct path realpath;
1191+ struct dentry *upperdir;
1192+
1193+ err = ovl_copy_up(dentry->d_parent);
1194+ if (err)
1195+ return err;
1196+
1197+ upperdir = ovl_dentry_upper(dentry->d_parent);
1198+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1199+ type = ovl_path_real(dentry, &realpath);
1200+ if (type != OVL_PATH_LOWER) {
1201+ err = -ESTALE;
1202+ if (realpath.dentry->d_parent != upperdir)
1203+ goto out_d_drop;
1204+
1205+ /* FIXME: create whiteout up front and rename to target */
1206+
1207+ if (is_dir)
1208+ err = vfs_rmdir(upperdir->d_inode, realpath.dentry);
1209+ else
1210+ err = vfs_unlink(upperdir->d_inode, realpath.dentry);
1211+ if (err)
1212+ goto out_d_drop;
1213+
1214+ ovl_dentry_version_inc(dentry->d_parent);
1215+ }
1216+
1217+ if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry))
1218+ err = ovl_whiteout(upperdir, dentry);
1219+
1220+ /*
1221+ * Keeping this dentry hashed would mean having to release
1222+ * upperpath/lowerpath, which could only be done if we are the
1223+ * sole user of this dentry. Too tricky... Just unhash for
1224+ * now.
1225+ */
1226+out_d_drop:
1227+ d_drop(dentry);
1228+ mutex_unlock(&upperdir->d_inode->i_mutex);
1229+
1230+ return err;
1231+}
1232+
1233+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
1234+{
1235+ return ovl_do_remove(dentry, false);
1236+}
1237+
1238+
1239+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
1240+{
1241+ int err;
1242+ enum ovl_path_type type;
1243+
1244+ type = ovl_path_type(dentry);
1245+ if (type != OVL_PATH_UPPER) {
1246+ err = ovl_check_empty_and_clear(dentry, type);
1247+ if (err)
1248+ return err;
1249+ }
1250+
1251+ return ovl_do_remove(dentry, true);
1252+}
1253+
1254+static int ovl_link(struct dentry *old, struct inode *newdir,
1255+ struct dentry *new)
1256+{
1257+ int err;
1258+ struct dentry *olddentry;
1259+ struct dentry *newdentry;
1260+ struct dentry *upperdir;
1261+
1262+ err = ovl_copy_up(old);
1263+ if (err)
1264+ goto out;
1265+
1266+ err = ovl_copy_up(new->d_parent);
1267+ if (err)
1268+ goto out;
1269+
1270+ upperdir = ovl_dentry_upper(new->d_parent);
1271+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1272+ newdentry = ovl_lookup_create(upperdir, new);
1273+ err = PTR_ERR(newdentry);
1274+ if (IS_ERR(newdentry))
1275+ goto out_unlock;
1276+
1277+ olddentry = ovl_dentry_upper(old);
1278+ err = vfs_link(olddentry, upperdir->d_inode, newdentry);
1279+ if (!err) {
1280+ if (WARN_ON(!newdentry->d_inode)) {
1281+ dput(newdentry);
1282+ err = -ENOENT;
1283+ goto out_unlock;
1284+ }
1285+
1286+ ovl_dentry_version_inc(new->d_parent);
1287+ ovl_dentry_update(new, newdentry);
1288+
1289+ ihold(old->d_inode);
1290+ d_instantiate(new, old->d_inode);
1291+ } else {
1292+ if (ovl_dentry_is_opaque(new))
1293+ ovl_whiteout(upperdir, new);
1294+ dput(newdentry);
1295+ }
1296+out_unlock:
1297+ mutex_unlock(&upperdir->d_inode->i_mutex);
1298+out:
1299+ return err;
1300+
1301+}
1302+
1303+static int ovl_rename(struct inode *olddir, struct dentry *old,
1304+ struct inode *newdir, struct dentry *new)
1305+{
1306+ int err;
1307+ enum ovl_path_type old_type;
1308+ enum ovl_path_type new_type;
1309+ struct dentry *old_upperdir;
1310+ struct dentry *new_upperdir;
1311+ struct dentry *olddentry;
1312+ struct dentry *newdentry;
1313+ struct dentry *trap;
1314+ bool old_opaque;
1315+ bool new_opaque;
1316+ bool new_create = false;
1317+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
1318+
1319+ /* Don't copy up directory trees */
1320+ old_type = ovl_path_type(old);
1321+ if (old_type != OVL_PATH_UPPER && is_dir)
1322+ return -EXDEV;
1323+
1324+ if (new->d_inode) {
1325+ new_type = ovl_path_type(new);
1326+
1327+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
1328+ if (ovl_dentry_lower(old)->d_inode ==
1329+ ovl_dentry_lower(new)->d_inode)
1330+ return 0;
1331+ }
1332+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
1333+ if (ovl_dentry_upper(old)->d_inode ==
1334+ ovl_dentry_upper(new)->d_inode)
1335+ return 0;
1336+ }
1337+
1338+ if (new_type != OVL_PATH_UPPER &&
1339+ S_ISDIR(new->d_inode->i_mode)) {
1340+ err = ovl_check_empty_and_clear(new, new_type);
1341+ if (err)
1342+ return err;
1343+ }
1344+ } else {
1345+ new_type = OVL_PATH_UPPER;
1346+ }
1347+
1348+ err = ovl_copy_up(old);
1349+ if (err)
1350+ return err;
1351+
1352+ err = ovl_copy_up(new->d_parent);
1353+ if (err)
1354+ return err;
1355+
1356+ old_upperdir = ovl_dentry_upper(old->d_parent);
1357+ new_upperdir = ovl_dentry_upper(new->d_parent);
1358+
1359+ trap = lock_rename(new_upperdir, old_upperdir);
1360+
1361+ olddentry = ovl_dentry_upper(old);
1362+ newdentry = ovl_dentry_upper(new);
1363+ if (newdentry) {
1364+ dget(newdentry);
1365+ } else {
1366+ new_create = true;
1367+ newdentry = ovl_lookup_create(new_upperdir, new);
1368+ err = PTR_ERR(newdentry);
1369+ if (IS_ERR(newdentry))
1370+ goto out_unlock;
1371+ }
1372+
1373+ err = -ESTALE;
1374+ if (olddentry->d_parent != old_upperdir)
1375+ goto out_dput;
1376+ if (newdentry->d_parent != new_upperdir)
1377+ goto out_dput;
1378+ if (olddentry == trap)
1379+ goto out_dput;
1380+ if (newdentry == trap)
1381+ goto out_dput;
1382+
1383+ old_opaque = ovl_dentry_is_opaque(old);
1384+ new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER;
1385+
1386+ if (is_dir && !old_opaque && new_opaque) {
1387+ err = ovl_set_opaque(olddentry);
1388+ if (err)
1389+ goto out_dput;
1390+ }
1391+
1392+ err = vfs_rename(old_upperdir->d_inode, olddentry,
1393+ new_upperdir->d_inode, newdentry);
1394+
1395+ if (err) {
1396+ if (new_create && ovl_dentry_is_opaque(new))
1397+ ovl_whiteout(new_upperdir, new);
1398+ if (is_dir && !old_opaque && new_opaque)
1399+ ovl_remove_opaque(olddentry);
1400+ goto out_dput;
1401+ }
1402+
1403+ if (old_type != OVL_PATH_UPPER || old_opaque)
1404+ err = ovl_whiteout(old_upperdir, old);
1405+ if (is_dir && old_opaque && !new_opaque)
1406+ ovl_remove_opaque(olddentry);
1407+
1408+ if (old_opaque != new_opaque)
1409+ ovl_dentry_set_opaque(old, new_opaque);
1410+
1411+ ovl_dentry_version_inc(old->d_parent);
1412+ ovl_dentry_version_inc(new->d_parent);
1413+
1414+out_dput:
1415+ dput(newdentry);
1416+out_unlock:
1417+ unlock_rename(new_upperdir, old_upperdir);
1418+ return err;
1419+}
1420+
1421+const struct inode_operations ovl_dir_inode_operations = {
1422+ .lookup = ovl_lookup,
1423+ .mkdir = ovl_mkdir,
1424+ .symlink = ovl_symlink,
1425+ .unlink = ovl_unlink,
1426+ .rmdir = ovl_rmdir,
1427+ .rename = ovl_rename,
1428+ .link = ovl_link,
1429+ .setattr = ovl_setattr,
1430+ .create = ovl_create,
1431+ .mknod = ovl_mknod,
1432+ .permission = ovl_permission,
1433+ .getattr = ovl_dir_getattr,
1434+ .setxattr = ovl_setxattr,
1435+ .getxattr = ovl_getxattr,
1436+ .listxattr = ovl_listxattr,
1437+ .removexattr = ovl_removexattr,
1438+};
1439--- /dev/null
1440+++ b/fs/overlayfs/inode.c
1441@@ -0,0 +1,383 @@
1442+/*
1443+ *
1444+ * Copyright (C) 2011 Novell Inc.
1445+ *
1446+ * This program is free software; you can redistribute it and/or modify it
1447+ * under the terms of the GNU General Public License version 2 as published by
1448+ * the Free Software Foundation.
1449+ */
1450+
1451+#include <linux/fs.h>
1452+#include <linux/slab.h>
1453+#include <linux/xattr.h>
1454+#include "overlayfs.h"
1455+
1456+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
1457+{
1458+ struct dentry *upperdentry;
1459+ int err;
1460+
1461+ if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry))
1462+ err = ovl_copy_up_truncate(dentry, attr->ia_size);
1463+ else
1464+ err = ovl_copy_up(dentry);
1465+ if (err)
1466+ return err;
1467+
1468+ upperdentry = ovl_dentry_upper(dentry);
1469+
1470+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
1471+ attr->ia_valid &= ~ATTR_MODE;
1472+
1473+ mutex_lock(&upperdentry->d_inode->i_mutex);
1474+ err = notify_change(upperdentry, attr);
1475+ mutex_unlock(&upperdentry->d_inode->i_mutex);
1476+
1477+ return err;
1478+}
1479+
1480+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
1481+ struct kstat *stat)
1482+{
1483+ struct path realpath;
1484+
1485+ ovl_path_real(dentry, &realpath);
1486+ return vfs_getattr(realpath.mnt, realpath.dentry, stat);
1487+}
1488+
1489+int ovl_permission(struct inode *inode, int mask)
1490+{
1491+ struct ovl_entry *oe;
1492+ struct dentry *alias = NULL;
1493+ struct inode *realinode;
1494+ struct dentry *realdentry;
1495+ bool is_upper;
1496+ int err;
1497+
1498+ if (S_ISDIR(inode->i_mode)) {
1499+ oe = inode->i_private;
1500+ } else if (mask & MAY_NOT_BLOCK) {
1501+ return -ECHILD;
1502+ } else {
1503+ /*
1504+ * For non-directories find an alias and get the info
1505+ * from there.
1506+ */
1507+ spin_lock(&inode->i_lock);
1508+ if (WARN_ON(list_empty(&inode->i_dentry))) {
1509+ spin_unlock(&inode->i_lock);
1510+ return -ENOENT;
1511+ }
1512+ alias = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1513+ dget(alias);
1514+ spin_unlock(&inode->i_lock);
1515+ oe = alias->d_fsdata;
1516+ }
1517+
1518+ realdentry = ovl_entry_real(oe, &is_upper);
1519+
1520+ /* Careful in RCU walk mode */
1521+ realinode = ACCESS_ONCE(realdentry->d_inode);
1522+ if (!realinode) {
1523+ WARN_ON(!(mask & MAY_NOT_BLOCK));
1524+ err = -ENOENT;
1525+ goto out_dput;
1526+ }
1527+
1528+ if (mask & MAY_WRITE) {
1529+ umode_t mode = realinode->i_mode;
1530+
1531+ /*
1532+ * Writes will always be redirected to upper layer, so
1533+ * ignore lower layer being read-only.
1534+ *
1535+ * If the overlay itself is read-only then proceed
1536+ * with the permission check, don't return EROFS.
1537+ * This will only happen if this is the lower layer of
1538+ * another overlayfs.
1539+ *
1540+ * If upper fs becomes read-only after the overlay was
1541+ * constructed return EROFS to prevent modification of
1542+ * upper layer.
1543+ */
1544+ err = -EROFS;
1545+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
1546+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1547+ goto out_dput;
1548+
1549+ /*
1550+ * Nobody gets write access to an immutable file.
1551+ */
1552+ err = -EACCES;
1553+ if (IS_IMMUTABLE(realinode))
1554+ goto out_dput;
1555+ }
1556+
1557+ if (realinode->i_op->permission)
1558+ err = realinode->i_op->permission(realinode, mask);
1559+ else
1560+ err = generic_permission(realinode, mask);
1561+out_dput:
1562+ dput(alias);
1563+ return err;
1564+}
1565+
1566+
1567+struct ovl_link_data { /* cookie handed from ovl_follow_link() to ovl_put_link() */
1568+ struct dentry *realdentry; /* real dentry whose ->put_link is called later */
1569+ void *cookie; /* value returned by the real inode's ->follow_link */
1570+};
1571+
1572+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
1573+{
1574+ void *ret;
1575+ struct dentry *realdentry;
1576+ struct inode *realinode;
1577+
1578+ realdentry = ovl_dentry_real(dentry);
1579+ realinode = realdentry->d_inode;
1580+
1581+ if (WARN_ON(!realinode->i_op->follow_link))
1582+ return ERR_PTR(-EPERM);
1583+
1584+ ret = realinode->i_op->follow_link(realdentry, nd);
1585+ if (IS_ERR(ret))
1586+ return ret;
1587+
1588+ if (realinode->i_op->put_link) {
1589+ struct ovl_link_data *data;
1590+
1591+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
1592+ if (!data) {
1593+ realinode->i_op->put_link(realdentry, nd, ret);
1594+ return ERR_PTR(-ENOMEM);
1595+ }
1596+ data->realdentry = realdentry;
1597+ data->cookie = ret;
1598+
1599+ return data;
1600+ } else {
1601+ return NULL;
1602+ }
1603+}
1604+
1605+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1606+{
1607+ struct inode *realinode;
1608+ struct ovl_link_data *data = c;
1609+
1610+ if (!data)
1611+ return;
1612+
1613+ realinode = data->realdentry->d_inode;
1614+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
1615+ kfree(data);
1616+}
1617+
1618+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
1619+{
1620+ struct path realpath;
1621+ struct inode *realinode;
1622+
1623+ ovl_path_real(dentry, &realpath);
1624+ realinode = realpath.dentry->d_inode;
1625+
1626+ if (!realinode->i_op->readlink)
1627+ return -EINVAL;
1628+
1629+ touch_atime(realpath.mnt, realpath.dentry);
1630+
1631+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
1632+}
1633+
1634+
/*
 * Return true if @name lies in overlayfs' private xattr namespace, i.e.
 * it starts with "trusted.overlay.".
 *
 * The whole prefix, trailing dot included, is compared.  The length is
 * derived from the literal so the two cannot drift apart; a shorter
 * hard-coded length would also match unrelated names such as
 * "trusted.overlap".
 */
static bool ovl_is_private_xattr(const char *name)
{
	static const char prefix[] = "trusted.overlay.";

	return strncmp(name, prefix, sizeof(prefix) - 1) == 0;
}
1639+
1640+int ovl_setxattr(struct dentry *dentry, const char *name,
1641+ const void *value, size_t size, int flags)
1642+{
1643+ int err;
1644+ struct dentry *upperdentry;
1645+
1646+ if (ovl_is_private_xattr(name))
1647+ return -EPERM;
1648+
1649+ err = ovl_copy_up(dentry);
1650+ if (err)
1651+ return err;
1652+
1653+ upperdentry = ovl_dentry_upper(dentry);
1654+ return vfs_setxattr(upperdentry, name, value, size, flags);
1655+}
1656+
1657+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1658+ void *value, size_t size)
1659+{
1660+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1661+ ovl_is_private_xattr(name))
1662+ return -ENODATA;
1663+
1664+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
1665+}
1666+
1667+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
1668+{
1669+ ssize_t res;
1670+ int off;
1671+
1672+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
1673+ if (res <= 0 || size == 0)
1674+ return res;
1675+
1676+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
1677+ return res;
1678+
1679+ /* filter out private xattrs */
1680+ for (off = 0; off < res;) {
1681+ char *s = list + off;
1682+ size_t slen = strlen(s) + 1;
1683+
1684+ BUG_ON(off + slen > res);
1685+
1686+ if (ovl_is_private_xattr(s)) {
1687+ res -= slen;
1688+ memmove(s, s + slen, res - off);
1689+ } else {
1690+ off += slen;
1691+ }
1692+ }
1693+
1694+ return res;
1695+}
1696+
1697+int ovl_removexattr(struct dentry *dentry, const char *name)
1698+{
1699+ int err;
1700+ struct path realpath;
1701+ enum ovl_path_type type;
1702+
1703+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1704+ ovl_is_private_xattr(name))
1705+ return -ENODATA;
1706+
1707+ type = ovl_path_real(dentry, &realpath);
1708+ if (type == OVL_PATH_LOWER) {
1709+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
1710+ if (err < 0)
1711+ return err;
1712+
1713+ err = ovl_copy_up(dentry);
1714+ if (err)
1715+ return err;
1716+
1717+ ovl_path_upper(dentry, &realpath);
1718+ }
1719+
1720+ return vfs_removexattr(realpath.dentry, name);
1721+}
1722+
1723+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
1724+ struct dentry *realdentry)
1725+{
1726+ if (type != OVL_PATH_LOWER)
1727+ return false;
1728+
1729+ if (special_file(realdentry->d_inode->i_mode))
1730+ return false;
1731+
1732+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
1733+ return false;
1734+
1735+ return true;
1736+}
1737+
1738+static struct file *ovl_open(struct dentry *dentry, int flags,
1739+ const struct cred *cred)
1740+{
1741+ int err;
1742+ struct path realpath;
1743+ enum ovl_path_type type;
1744+
1745+ type = ovl_path_real(dentry, &realpath);
1746+ if (ovl_open_need_copy_up(flags, type, realpath.dentry)) {
1747+ if (flags & O_TRUNC)
1748+ err = ovl_copy_up_truncate(dentry, 0);
1749+ else
1750+ err = ovl_copy_up(dentry);
1751+ if (err)
1752+ return ERR_PTR(err);
1753+
1754+ ovl_path_upper(dentry, &realpath);
1755+ }
1756+
1757+ return vfs_open(&realpath, flags, cred);
1758+}
1759+
1760+static const struct inode_operations ovl_file_inode_operations = {
1761+ .setattr = ovl_setattr,
1762+ .permission = ovl_permission,
1763+ .getattr = ovl_getattr,
1764+ .setxattr = ovl_setxattr,
1765+ .getxattr = ovl_getxattr,
1766+ .listxattr = ovl_listxattr,
1767+ .removexattr = ovl_removexattr,
1768+ .open = ovl_open,
1769+};
1770+
1771+static const struct inode_operations ovl_symlink_inode_operations = {
1772+ .setattr = ovl_setattr,
1773+ .follow_link = ovl_follow_link,
1774+ .put_link = ovl_put_link,
1775+ .readlink = ovl_readlink,
1776+ .getattr = ovl_getattr,
1777+ .setxattr = ovl_setxattr,
1778+ .getxattr = ovl_getxattr,
1779+ .listxattr = ovl_listxattr,
1780+ .removexattr = ovl_removexattr,
1781+};
1782+
1783+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1784+ struct ovl_entry *oe)
1785+{
1786+ struct inode *inode;
1787+
1788+ inode = new_inode(sb);
1789+ if (!inode)
1790+ return NULL;
1791+
1792+ mode &= S_IFMT;
1793+
1794+ inode->i_ino = get_next_ino();
1795+ inode->i_mode = mode;
1796+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
1797+
1798+ switch (mode) {
1799+ case S_IFDIR:
1800+ inode->i_private = oe;
1801+ inode->i_op = &ovl_dir_inode_operations;
1802+ inode->i_fop = &ovl_dir_operations;
1803+ break;
1804+
1805+ case S_IFLNK:
1806+ inode->i_op = &ovl_symlink_inode_operations;
1807+ break;
1808+
1809+ case S_IFREG:
1810+ case S_IFSOCK:
1811+ case S_IFBLK:
1812+ case S_IFCHR:
1813+ case S_IFIFO:
1814+ inode->i_op = &ovl_file_inode_operations;
1815+ break;
1816+
1817+ default:
1818+ WARN(1, "illegal file type: %i\n", mode);
1819+ inode = NULL;
1820+ }
1821+
1822+ return inode;
1823+
1824+}
1825--- /dev/null
1826+++ b/fs/overlayfs/overlayfs.h
1827@@ -0,0 +1,63 @@
1828+/*
1829+ *
1830+ * Copyright (C) 2011 Novell Inc.
1831+ *
1832+ * This program is free software; you can redistribute it and/or modify it
1833+ * under the terms of the GNU General Public License version 2 as published by
1834+ * the Free Software Foundation.
1835+ */
1836+
1837+struct ovl_entry;
1838+
1839+enum ovl_path_type { /* where the real object behind an overlay dentry comes from */
1840+ OVL_PATH_UPPER, /* presumably: real object is on the upper layer only - confirm */
1841+ OVL_PATH_MERGE, /* directories of this type are read by merging upper and lower */
1842+ OVL_PATH_LOWER, /* opening such a file for write triggers a copy up */
1843+};
1844+
1845+extern const char *ovl_opaque_xattr; /* xattr name set to "y" on opaque upper directories */
1846+extern const char *ovl_whiteout_xattr; /* presumably the xattr name marking whiteouts - confirm */
1847+extern const struct dentry_operations ovl_dentry_operations;
1848+
1849+enum ovl_path_type ovl_path_type(struct dentry *dentry);
1850+u64 ovl_dentry_version_get(struct dentry *dentry);
1851+void ovl_dentry_version_inc(struct dentry *dentry); /* called on the parent after it changes; readdir caches compare against it */
1852+void ovl_path_upper(struct dentry *dentry, struct path *path);
1853+void ovl_path_lower(struct dentry *dentry, struct path *path);
1854+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); /* fills *path and returns its type */
1855+struct dentry *ovl_dentry_upper(struct dentry *dentry); /* may return NULL; callers test for it */
1856+struct dentry *ovl_dentry_lower(struct dentry *dentry);
1857+struct dentry *ovl_dentry_real(struct dentry *dentry);
1858+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
1859+bool ovl_dentry_is_opaque(struct dentry *dentry);
1860+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
1861+bool ovl_is_whiteout(struct dentry *dentry);
1862+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
1863+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
1864+ struct nameidata *nd);
1865+
1866+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1867+ struct kstat *stat, const char *link); /* returns the new upper dentry or an ERR_PTR() */
1868+
1869+/* readdir.c */
1870+extern const struct file_operations ovl_dir_operations;
1871+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type);
1872+
1873+/* inode.c */
1874+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
1875+int ovl_permission(struct inode *inode, int mask);
1876+int ovl_setxattr(struct dentry *dentry, const char *name,
1877+ const void *value, size_t size, int flags);
1878+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1879+ void *value, size_t size);
1880+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
1881+int ovl_removexattr(struct dentry *dentry, const char *name);
1882+
1883+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1884+ struct ovl_entry *oe); /* returns NULL on failure */
1885+/* dir.c */
1886+extern const struct inode_operations ovl_dir_inode_operations;
1887+
1888+/* copy_up.c */
1889+int ovl_copy_up(struct dentry *dentry); /* returns 0 or a negative errno */
1890+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size);
1891--- /dev/null
1892+++ b/fs/overlayfs/readdir.c
1893@@ -0,0 +1,559 @@
1894+/*
1895+ *
1896+ * Copyright (C) 2011 Novell Inc.
1897+ *
1898+ * This program is free software; you can redistribute it and/or modify it
1899+ * under the terms of the GNU General Public License version 2 as published by
1900+ * the Free Software Foundation.
1901+ */
1902+
1903+#include <linux/fs.h>
1904+#include <linux/slab.h>
1905+#include <linux/namei.h>
1906+#include <linux/file.h>
1907+#include <linux/xattr.h>
1908+#include <linux/rbtree.h>
1909+#include <linux/security.h>
1910+#include "overlayfs.h"
1911+
1912+struct ovl_cache_entry { /* one directory entry held in the readdir cache */
1913+ const char *name; /* NUL-terminated copy stored right behind this struct */
1914+ unsigned int len; /* length of name, without the terminator */
1915+ unsigned int type; /* d_type as passed to the fill callback */
1916+ u64 ino; /* inode number as passed to the fill callback */
1917+ bool is_whiteout; /* set by ovl_dir_mark_whiteouts(); such entries are not emitted */
1918+ struct list_head l_node; /* link in the cache list */
1919+ struct rb_node node; /* link in the name-ordered rb tree */
1920+};
1921+
1922+struct ovl_readdir_data { /* state passed to the fill callbacks as their buf argument */
1923+ struct rb_root *root; /* rb tree of names already collected */
1924+ struct list_head *list; /* list that receives the cache entries */
1925+ struct list_head *middle; /* list head that lower entries are appended to */
1926+ struct dentry *dir; /* upper directory, used for the whiteout lookups */
1927+ int count; /* entries seen during the current vfs_readdir() call */
1928+ int err; /* error recorded by a fill callback, 0 if none */
1929+};
1930+
1931+struct ovl_dir_file { /* per-open state of an overlay directory (file->private_data) */
1932+ bool is_real; /* true: readdir and llseek are forwarded to realfile */
1933+ bool is_cached; /* true: cache holds the merged listing */
1934+ struct list_head cursor; /* marker in cache; the next entry to emit follows it */
1935+ u64 cache_version; /* directory version recorded when cache was filled */
1936+ struct list_head cache; /* list of struct ovl_cache_entry */
1937+ struct file *realfile; /* underlying directory file used while is_real */
1938+};
1939+
1940+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
1941+{
1942+ return container_of(n, struct ovl_cache_entry, node);
1943+}
1944+
1945+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
1946+ const char *name, int len)
1947+{
1948+ struct rb_node *node = root->rb_node;
1949+ int cmp;
1950+
1951+ while (node) {
1952+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
1953+
1954+ cmp = strncmp(name, p->name, len);
1955+ if (cmp > 0)
1956+ node = p->node.rb_right;
1957+ else if (cmp < 0 || len < p->len)
1958+ node = p->node.rb_left;
1959+ else
1960+ return p;
1961+ }
1962+
1963+ return NULL;
1964+}
1965+
1966+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
1967+ u64 ino, unsigned int d_type)
1968+{
1969+ struct ovl_cache_entry *p;
1970+
1971+ p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
1972+ if (p) {
1973+ char *name_copy = (char *) (p + 1);
1974+ memcpy(name_copy, name, len);
1975+ name_copy[len] = '\0';
1976+ p->name = name_copy;
1977+ p->len = len;
1978+ p->type = d_type;
1979+ p->ino = ino;
1980+ p->is_whiteout = false;
1981+ }
1982+
1983+ return p;
1984+}
1985+
1986+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
1987+ const char *name, int len, u64 ino,
1988+ unsigned int d_type)
1989+{
1990+ struct rb_node **newp = &rdd->root->rb_node;
1991+ struct rb_node *parent = NULL;
1992+ struct ovl_cache_entry *p;
1993+
1994+ while (*newp) {
1995+ int cmp;
1996+ struct ovl_cache_entry *tmp;
1997+
1998+ parent = *newp;
1999+ tmp = ovl_cache_entry_from_node(*newp);
2000+ cmp = strncmp(name, tmp->name, len);
2001+ if (cmp > 0)
2002+ newp = &tmp->node.rb_right;
2003+ else if (cmp < 0 || len < tmp->len)
2004+ newp = &tmp->node.rb_left;
2005+ else
2006+ return 0;
2007+ }
2008+
2009+ p = ovl_cache_entry_new(name, len, ino, d_type);
2010+ if (p == NULL)
2011+ return -ENOMEM;
2012+
2013+ list_add_tail(&p->l_node, rdd->list);
2014+ rb_link_node(&p->node, parent, newp);
2015+ rb_insert_color(&p->node, rdd->root);
2016+
2017+ return 0;
2018+}
2019+
2020+static int ovl_fill_lower(void *buf, const char *name, int namelen,
2021+ loff_t offset, u64 ino, unsigned int d_type)
2022+{
2023+ struct ovl_readdir_data *rdd = buf;
2024+ struct ovl_cache_entry *p;
2025+
2026+ rdd->count++;
2027+ p = ovl_cache_entry_find(rdd->root, name, namelen);
2028+ if (p) {
2029+ list_move_tail(&p->l_node, rdd->middle);
2030+ } else {
2031+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
2032+ if (p == NULL)
2033+ rdd->err = -ENOMEM;
2034+ else
2035+ list_add_tail(&p->l_node, rdd->middle);
2036+ }
2037+
2038+ return rdd->err;
2039+}
2040+
2041+static void ovl_cache_free(struct list_head *list)
2042+{
2043+ struct ovl_cache_entry *p;
2044+ struct ovl_cache_entry *n;
2045+
2046+ list_for_each_entry_safe(p, n, list, l_node)
2047+ kfree(p);
2048+
2049+ INIT_LIST_HEAD(list);
2050+}
2051+
2052+static int ovl_fill_upper(void *buf, const char *name, int namelen,
2053+ loff_t offset, u64 ino, unsigned int d_type)
2054+{
2055+ struct ovl_readdir_data *rdd = buf;
2056+
2057+ rdd->count++;
2058+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
2059+}
2060+
2061+static inline int ovl_dir_read(struct path *realpath,
2062+ struct ovl_readdir_data *rdd, filldir_t filler)
2063+{
2064+ struct file *realfile;
2065+ int err;
2066+
2067+ realfile = vfs_open(realpath, O_RDONLY | O_DIRECTORY, current_cred());
2068+ if (IS_ERR(realfile))
2069+ return PTR_ERR(realfile);
2070+
2071+ do {
2072+ rdd->count = 0;
2073+ rdd->err = 0;
2074+ err = vfs_readdir(realfile, filler, rdd);
2075+ if (err >= 0)
2076+ err = rdd->err;
2077+ } while (!err && rdd->count);
2078+ fput(realfile);
2079+
2080+ return 0;
2081+}
2082+
2083+static void ovl_dir_reset(struct file *file)
2084+{
2085+ struct ovl_dir_file *od = file->private_data;
2086+ enum ovl_path_type type = ovl_path_type(file->f_path.dentry);
2087+
2088+ if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) {
2089+ list_del_init(&od->cursor);
2090+ ovl_cache_free(&od->cache);
2091+ od->is_cached = false;
2092+ }
2093+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
2094+ if (od->is_real && type == OVL_PATH_MERGE) {
2095+ fput(od->realfile);
2096+ od->realfile = NULL;
2097+ od->is_real = false;
2098+ }
2099+}
2100+
2101+static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd)
2102+{
2103+ struct ovl_cache_entry *p;
2104+ struct dentry *dentry;
2105+ const struct cred *old_cred;
2106+ struct cred *override_cred;
2107+
2108+ override_cred = prepare_creds();
2109+ if (!override_cred) {
2110+ ovl_cache_free(rdd->list);
2111+ return -ENOMEM;
2112+ }
2113+
2114+ /*
2115+ * CAP_SYS_ADMIN for getxattr
2116+ * CAP_DAC_OVERRIDE for lookup
2117+ */
2118+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2119+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2120+ old_cred = override_creds(override_cred);
2121+
2122+ mutex_lock(&rdd->dir->d_inode->i_mutex);
2123+ list_for_each_entry(p, rdd->list, l_node) {
2124+ if (p->type != DT_LNK)
2125+ continue;
2126+
2127+ dentry = lookup_one_len(p->name, rdd->dir, p->len);
2128+ if (IS_ERR(dentry))
2129+ continue;
2130+
2131+ p->is_whiteout = ovl_is_whiteout(dentry);
2132+ dput(dentry);
2133+ }
2134+ mutex_unlock(&rdd->dir->d_inode->i_mutex);
2135+
2136+ revert_creds(old_cred);
2137+ put_cred(override_cred);
2138+
2139+ return 0;
2140+}
2141+
2142+static inline int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath,
2143+ struct ovl_readdir_data *rdd)
2144+{
2145+ int err;
2146+ struct rb_root root = RB_ROOT;
2147+ struct list_head middle;
2148+
2149+ rdd->root = &root;
2150+ if (upperpath->dentry) {
2151+ rdd->dir = upperpath->dentry;
2152+ err = ovl_dir_read(upperpath, rdd, ovl_fill_upper);
2153+ if (err)
2154+ goto out;
2155+
2156+ err = ovl_dir_mark_whiteouts(rdd);
2157+ if (err)
2158+ goto out;
2159+ }
2160+ /*
2161+ * Insert lowerpath entries before upperpath ones, this allows
2162+ * offsets to be reasonably constant
2163+ */
2164+ list_add(&middle, rdd->list);
2165+ rdd->middle = &middle;
2166+ err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower);
2167+ list_del(&middle);
2168+out:
2169+ rdd->root = NULL;
2170+
2171+ return err;
2172+}
2173+
2174+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
2175+{
2176+ struct list_head *l;
2177+ loff_t off;
2178+
2179+ l = od->cache.next;
2180+ for (off = 0; off < pos; off++) {
2181+ if (l == &od->cache)
2182+ break;
2183+ l = l->next;
2184+ }
2185+ list_move_tail(&od->cursor, l);
2186+}
2187+
2188+static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
2189+{
2190+ struct ovl_dir_file *od = file->private_data;
2191+ int res;
2192+
2193+ if (!file->f_pos)
2194+ ovl_dir_reset(file);
2195+
2196+ if (od->is_real) {
2197+ res = vfs_readdir(od->realfile, filler, buf);
2198+ file->f_pos = od->realfile->f_pos;
2199+
2200+ return res;
2201+ }
2202+
2203+ if (!od->is_cached) {
2204+ struct path lowerpath;
2205+ struct path upperpath;
2206+ struct ovl_readdir_data rdd = { .list = &od->cache };
2207+
2208+ ovl_path_lower(file->f_path.dentry, &lowerpath);
2209+ ovl_path_upper(file->f_path.dentry, &upperpath);
2210+
2211+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2212+ if (res) {
2213+ ovl_cache_free(rdd.list);
2214+ return res;
2215+ }
2216+
2217+ od->cache_version = ovl_dentry_version_get(file->f_path.dentry);
2218+ od->is_cached = true;
2219+
2220+ ovl_seek_cursor(od, file->f_pos);
2221+ }
2222+
2223+ while (od->cursor.next != &od->cache) {
2224+ int over;
2225+ loff_t off;
2226+ struct ovl_cache_entry *p;
2227+
2228+ p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node);
2229+ off = file->f_pos;
2230+ if (!p->is_whiteout) {
2231+ over = filler(buf, p->name, p->len, off, p->ino, p->type);
2232+ if (over)
2233+ break;
2234+ }
2235+ file->f_pos++;
2236+ list_move(&od->cursor, &p->l_node);
2237+ }
2238+
2239+ return 0;
2240+}
2241+
2242+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
2243+{
2244+ loff_t res;
2245+ struct ovl_dir_file *od = file->private_data;
2246+
2247+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
2248+ if (!file->f_pos)
2249+ ovl_dir_reset(file);
2250+
2251+ if (od->is_real) {
2252+ res = vfs_llseek(od->realfile, offset, origin);
2253+ file->f_pos = od->realfile->f_pos;
2254+ } else {
2255+ res = -EINVAL;
2256+
2257+ switch (origin) {
2258+ case SEEK_CUR:
2259+ offset += file->f_pos;
2260+ break;
2261+ case SEEK_SET:
2262+ break;
2263+ default:
2264+ goto out_unlock;
2265+ }
2266+ if (offset < 0)
2267+ goto out_unlock;
2268+
2269+ if (offset != file->f_pos) {
2270+ file->f_pos = offset;
2271+ if (od->is_cached)
2272+ ovl_seek_cursor(od, offset);
2273+ }
2274+ res = offset;
2275+ }
2276+out_unlock:
2277+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
2278+
2279+ return res;
2280+}
2281+
2282+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
2283+ int datasync)
2284+{
2285+ struct ovl_dir_file *od = file->private_data;
2286+
2287+ /* May need to reopen directory if it got copied up */
2288+ if (!od->realfile) {
2289+ struct path upperpath;
2290+
2291+ ovl_path_upper(file->f_path.dentry, &upperpath);
2292+ od->realfile = vfs_open(&upperpath, O_RDONLY, current_cred());
2293+ if (IS_ERR(od->realfile))
2294+ return PTR_ERR(od->realfile);
2295+ }
2296+
2297+ return vfs_fsync_range(od->realfile, start, end, datasync);
2298+}
2299+
2300+static int ovl_dir_release(struct inode *inode, struct file *file)
2301+{
2302+ struct ovl_dir_file *od = file->private_data;
2303+
2304+ list_del(&od->cursor);
2305+ ovl_cache_free(&od->cache);
2306+ if (od->realfile)
2307+ fput(od->realfile);
2308+ kfree(od);
2309+
2310+ return 0;
2311+}
2312+
2313+static int ovl_dir_open(struct inode *inode, struct file *file)
2314+{
2315+ struct path realpath;
2316+ struct file *realfile;
2317+ struct ovl_dir_file *od;
2318+ enum ovl_path_type type;
2319+
2320+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
2321+ if (!od)
2322+ return -ENOMEM;
2323+
2324+ type = ovl_path_real(file->f_path.dentry, &realpath);
2325+ realfile = vfs_open(&realpath, file->f_flags, current_cred());
2326+ if (IS_ERR(realfile)) {
2327+ kfree(od);
2328+ return PTR_ERR(realfile);
2329+ }
2330+ INIT_LIST_HEAD(&od->cache);
2331+ INIT_LIST_HEAD(&od->cursor);
2332+ od->is_cached = false;
2333+ od->realfile = realfile;
2334+ od->is_real = (type != OVL_PATH_MERGE);
2335+ file->private_data = od;
2336+
2337+ return 0;
2338+}
2339+
2340+const struct file_operations ovl_dir_operations = {
2341+ .read = generic_read_dir,
2342+ .open = ovl_dir_open,
2343+ .readdir = ovl_readdir,
2344+ .llseek = ovl_dir_llseek,
2345+ .fsync = ovl_dir_fsync,
2346+ .release = ovl_dir_release,
2347+};
2348+
2349+static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
2350+{
2351+ int err;
2352+ struct path lowerpath;
2353+ struct path upperpath;
2354+ struct ovl_cache_entry *p;
2355+ struct ovl_readdir_data rdd = { .list = list };
2356+
2357+ ovl_path_upper(dentry, &upperpath);
2358+ ovl_path_lower(dentry, &lowerpath);
2359+
2360+ err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2361+ if (err)
2362+ return err;
2363+
2364+ err = 0;
2365+
2366+ list_for_each_entry(p, list, l_node) {
2367+ if (p->is_whiteout)
2368+ continue;
2369+
2370+ if (p->name[0] == '.') {
2371+ if (p->len == 1)
2372+ continue;
2373+ if (p->len == 2 && p->name[1] == '.')
2374+ continue;
2375+ }
2376+ err = -ENOTEMPTY;
2377+ break;
2378+ }
2379+
2380+ return err;
2381+}
2382+
2383+static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list)
2384+{
2385+ struct path upperpath;
2386+ struct dentry *upperdir;
2387+ struct ovl_cache_entry *p;
2388+ const struct cred *old_cred;
2389+ struct cred *override_cred;
2390+ int err;
2391+
2392+ ovl_path_upper(dir, &upperpath);
2393+ upperdir = upperpath.dentry;
2394+
2395+ override_cred = prepare_creds();
2396+ if (!override_cred)
2397+ return -ENOMEM;
2398+
2399+ /*
2400+ * CAP_DAC_OVERRIDE for lookup and unlink
2401+ * CAP_SYS_ADMIN for setxattr of "trusted" namespace
2402+ * CAP_FOWNER for unlink in sticky directory
2403+ */
2404+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2405+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2406+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
2407+ old_cred = override_creds(override_cred);
2408+
2409+ err = vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0);
2410+ if (err)
2411+ goto out_revert_creds;
2412+
2413+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2414+ list_for_each_entry(p, list, l_node) {
2415+ struct dentry *dentry;
2416+ int ret;
2417+
2418+ if (!p->is_whiteout)
2419+ continue;
2420+
2421+ dentry = lookup_one_len(p->name, upperdir, p->len);
2422+ if (IS_ERR(dentry)) {
2423+ printk(KERN_WARNING "overlayfs: failed to lookup whiteout %.*s: %li\n", p->len, p->name, PTR_ERR(dentry));
2424+ continue;
2425+ }
2426+ ret = vfs_unlink(upperdir->d_inode, dentry);
2427+ dput(dentry);
2428+ if (ret)
2429+ printk(KERN_WARNING "overlayfs: failed to unlink whiteout %.*s: %i\n", p->len, p->name, ret);
2430+ }
2431+ mutex_unlock(&upperdir->d_inode->i_mutex);
2432+
2433+out_revert_creds:
2434+ revert_creds(old_cred);
2435+ put_cred(override_cred);
2436+
2437+ return err;
2438+}
2439+
2440+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type)
2441+{
2442+ int err;
2443+ LIST_HEAD(list);
2444+
2445+ err = ovl_check_empty_dir(dentry, &list);
2446+ if (!err && type == OVL_PATH_MERGE)
2447+ err = ovl_remove_whiteouts(dentry, &list);
2448+
2449+ ovl_cache_free(&list);
2450+
2451+ return err;
2452+}
2453--- /dev/null
2454+++ b/fs/overlayfs/super.c
2455@@ -0,0 +1,656 @@
2456+/*
2457+ *
2458+ * Copyright (C) 2011 Novell Inc.
2459+ *
2460+ * This program is free software; you can redistribute it and/or modify it
2461+ * under the terms of the GNU General Public License version 2 as published by
2462+ * the Free Software Foundation.
2463+ */
2464+
2465+#include <linux/fs.h>
2466+#include <linux/namei.h>
2467+#include <linux/xattr.h>
2468+#include <linux/security.h>
2469+#include <linux/mount.h>
2470+#include <linux/slab.h>
2471+#include <linux/parser.h>
2472+#include <linux/module.h>
2473+#include <linux/seq_file.h>
2474+#include "overlayfs.h"
2475+
2476+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
2477+MODULE_DESCRIPTION("Overlay filesystem");
2478+MODULE_LICENSE("GPL");
2479+
/*
 * Mount options as given by the user.  Both strings are heap allocated
 * (match_strdup) and freed with kfree() when the superblock goes away.
 */
struct ovl_config {
	char *lowerdir;		/* value of the "lowerdir=" option */
	char *upperdir;		/* value of the "upperdir=" option */
};
2484+
2485+/* private information held for overlayfs's superblock */
2486+struct ovl_fs {
2487+ struct vfsmount *upper_mnt;
2488+ struct vfsmount *lower_mnt;
2489+ /* pathnames of lower and upper dirs, for show_options */
2490+ struct ovl_config config;
2491+};
2492+
2493+/* private information held for every overlayfs dentry */
2494+struct ovl_entry {
2495+ /*
2496+ * Keep "double reference" on upper dentries, so that
2497+ * d_delete() doesn't think it's OK to reset d_inode to NULL.
2498+ */
2499+ struct dentry *__upperdentry;
2500+ struct dentry *lowerdentry;
2501+ union {
2502+ struct {
2503+ u64 version;
2504+ bool opaque;
2505+ };
2506+ struct rcu_head rcu;
2507+ };
2508+};
2509+
/*
 * Names of the extended attributes overlayfs uses on upper-layer objects;
 * in both cases the value "y" means the flag is set.
 */
const char *ovl_whiteout_xattr = "trusted.overlay.whiteout";
const char *ovl_opaque_xattr = "trusted.overlay.opaque";
2512+
2513+
2514+enum ovl_path_type ovl_path_type(struct dentry *dentry)
2515+{
2516+ struct ovl_entry *oe = dentry->d_fsdata;
2517+
2518+ if (oe->__upperdentry) {
2519+ if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode))
2520+ return OVL_PATH_MERGE;
2521+ else
2522+ return OVL_PATH_UPPER;
2523+ } else {
2524+ return OVL_PATH_LOWER;
2525+ }
2526+}
2527+
2528+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
2529+{
2530+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
2531+ smp_read_barrier_depends();
2532+ return upperdentry;
2533+}
2534+
2535+void ovl_path_upper(struct dentry *dentry, struct path *path)
2536+{
2537+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2538+ struct ovl_entry *oe = dentry->d_fsdata;
2539+
2540+ path->mnt = ofs->upper_mnt;
2541+ path->dentry = ovl_upperdentry_dereference(oe);
2542+}
2543+
2544+void ovl_path_lower(struct dentry *dentry, struct path *path)
2545+{
2546+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2547+ struct ovl_entry *oe = dentry->d_fsdata;
2548+
2549+ path->mnt = ofs->lower_mnt;
2550+ path->dentry = oe->lowerdentry;
2551+}
2552+
2553+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
2554+{
2555+
2556+ enum ovl_path_type type = ovl_path_type(dentry);
2557+
2558+ if (type == OVL_PATH_LOWER)
2559+ ovl_path_lower(dentry, path);
2560+ else
2561+ ovl_path_upper(dentry, path);
2562+
2563+ return type;
2564+}
2565+
2566+struct dentry *ovl_dentry_upper(struct dentry *dentry)
2567+{
2568+ struct ovl_entry *oe = dentry->d_fsdata;
2569+
2570+ return ovl_upperdentry_dereference(oe);
2571+}
2572+
2573+struct dentry *ovl_dentry_lower(struct dentry *dentry)
2574+{
2575+ struct ovl_entry *oe = dentry->d_fsdata;
2576+
2577+ return oe->lowerdentry;
2578+}
2579+
2580+struct dentry *ovl_dentry_real(struct dentry *dentry)
2581+{
2582+ struct ovl_entry *oe = dentry->d_fsdata;
2583+ struct dentry *realdentry;
2584+
2585+ realdentry = ovl_upperdentry_dereference(oe);
2586+ if (!realdentry)
2587+ realdentry = oe->lowerdentry;
2588+
2589+ return realdentry;
2590+}
2591+
2592+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
2593+{
2594+ struct dentry *realdentry;
2595+
2596+ realdentry = ovl_upperdentry_dereference(oe);
2597+ if (realdentry) {
2598+ *is_upper = true;
2599+ } else {
2600+ realdentry = oe->lowerdentry;
2601+ *is_upper = false;
2602+ }
2603+ return realdentry;
2604+}
2605+
2606+bool ovl_dentry_is_opaque(struct dentry *dentry)
2607+{
2608+ struct ovl_entry *oe = dentry->d_fsdata;
2609+ return oe->opaque;
2610+}
2611+
2612+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
2613+{
2614+ struct ovl_entry *oe = dentry->d_fsdata;
2615+ oe->opaque = opaque;
2616+}
2617+
2618+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
2619+{
2620+ struct ovl_entry *oe = dentry->d_fsdata;
2621+
2622+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
2623+ WARN_ON(oe->__upperdentry);
2624+ BUG_ON(!upperdentry->d_inode);
2625+ smp_wmb();
2626+ oe->__upperdentry = dget(upperdentry);
2627+}
2628+
2629+void ovl_dentry_version_inc(struct dentry *dentry)
2630+{
2631+ struct ovl_entry *oe = dentry->d_fsdata;
2632+
2633+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2634+ oe->version++;
2635+}
2636+
2637+u64 ovl_dentry_version_get(struct dentry *dentry)
2638+{
2639+ struct ovl_entry *oe = dentry->d_fsdata;
2640+
2641+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2642+ return oe->version;
2643+}
2644+
2645+bool ovl_is_whiteout(struct dentry *dentry)
2646+{
2647+ int res;
2648+ char val;
2649+
2650+ if (!dentry)
2651+ return false;
2652+ if (!dentry->d_inode)
2653+ return false;
2654+ if (!S_ISLNK(dentry->d_inode->i_mode))
2655+ return false;
2656+
2657+ res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1);
2658+ if (res == 1 && val == 'y')
2659+ return true;
2660+
2661+ return false;
2662+}
2663+
2664+static bool ovl_is_opaquedir(struct dentry *dentry)
2665+{
2666+ int res;
2667+ char val;
2668+
2669+ if (!S_ISDIR(dentry->d_inode->i_mode))
2670+ return false;
2671+
2672+ res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1);
2673+ if (res == 1 && val == 'y')
2674+ return true;
2675+
2676+ return false;
2677+}
2678+
2679+static void ovl_entry_free(struct rcu_head *head)
2680+{
2681+ struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu);
2682+ kfree(oe);
2683+}
2684+
2685+static void ovl_dentry_release(struct dentry *dentry)
2686+{
2687+ struct ovl_entry *oe = dentry->d_fsdata;
2688+
2689+ if (oe) {
2690+ dput(oe->__upperdentry);
2691+ dput(oe->__upperdentry);
2692+ dput(oe->lowerdentry);
2693+ call_rcu(&oe->rcu, ovl_entry_free);
2694+ }
2695+}
2696+
2697+const struct dentry_operations ovl_dentry_operations = {
2698+ .d_release = ovl_dentry_release,
2699+};
2700+
2701+static struct ovl_entry *ovl_alloc_entry(void)
2702+{
2703+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
2704+}
2705+
2706+static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name)
2707+{
2708+ struct dentry *dentry;
2709+
2710+ mutex_lock(&dir->d_inode->i_mutex);
2711+ dentry = lookup_one_len(name->name, dir, name->len);
2712+ mutex_unlock(&dir->d_inode->i_mutex);
2713+
2714+ if (IS_ERR(dentry)) {
2715+ if (PTR_ERR(dentry) == -ENOENT)
2716+ dentry = NULL;
2717+ } else if (!dentry->d_inode) {
2718+ dput(dentry);
2719+ dentry = NULL;
2720+ }
2721+ return dentry;
2722+}
2723+
2724+static int ovl_do_lookup(struct dentry *dentry)
2725+{
2726+ struct ovl_entry *oe;
2727+ struct dentry *upperdir;
2728+ struct dentry *lowerdir;
2729+ struct dentry *upperdentry = NULL;
2730+ struct dentry *lowerdentry = NULL;
2731+ struct inode *inode = NULL;
2732+ int err;
2733+
2734+ err = -ENOMEM;
2735+ oe = ovl_alloc_entry();
2736+ if (!oe)
2737+ goto out;
2738+
2739+ upperdir = ovl_dentry_upper(dentry->d_parent);
2740+ lowerdir = ovl_dentry_lower(dentry->d_parent);
2741+
2742+ if (upperdir) {
2743+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
2744+ err = PTR_ERR(upperdentry);
2745+ if (IS_ERR(upperdentry))
2746+ goto out_put_dir;
2747+
2748+ if (lowerdir && upperdentry &&
2749+ (S_ISLNK(upperdentry->d_inode->i_mode) ||
2750+ S_ISDIR(upperdentry->d_inode->i_mode))) {
2751+ const struct cred *old_cred;
2752+ struct cred *override_cred;
2753+
2754+ err = -ENOMEM;
2755+ override_cred = prepare_creds();
2756+ if (!override_cred)
2757+ goto out_dput_upper;
2758+
2759+ /* CAP_SYS_ADMIN needed for getxattr */
2760+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2761+ old_cred = override_creds(override_cred);
2762+
2763+ if (ovl_is_opaquedir(upperdentry)) {
2764+ oe->opaque = true;
2765+ } else if (ovl_is_whiteout(upperdentry)) {
2766+ dput(upperdentry);
2767+ upperdentry = NULL;
2768+ oe->opaque = true;
2769+ }
2770+ revert_creds(old_cred);
2771+ put_cred(override_cred);
2772+ }
2773+ }
2774+ if (lowerdir && !oe->opaque) {
2775+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
2776+ err = PTR_ERR(lowerdentry);
2777+ if (IS_ERR(lowerdentry))
2778+ goto out_dput_upper;
2779+ }
2780+
2781+ if (lowerdentry && upperdentry &&
2782+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
2783+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
2784+ dput(lowerdentry);
2785+ lowerdentry = NULL;
2786+ oe->opaque = true;
2787+ }
2788+
2789+ if (lowerdentry || upperdentry) {
2790+ struct dentry *realdentry;
2791+
2792+ realdentry = upperdentry ? upperdentry : lowerdentry;
2793+ err = -ENOMEM;
2794+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe);
2795+ if (!inode)
2796+ goto out_dput;
2797+ }
2798+
2799+ if (upperdentry)
2800+ oe->__upperdentry = dget(upperdentry);
2801+
2802+ if (lowerdentry)
2803+ oe->lowerdentry = lowerdentry;
2804+
2805+ dentry->d_fsdata = oe;
2806+ dentry->d_op = &ovl_dentry_operations;
2807+ d_add(dentry, inode);
2808+
2809+ return 0;
2810+
2811+out_dput:
2812+ dput(lowerdentry);
2813+out_dput_upper:
2814+ dput(upperdentry);
2815+out_put_dir:
2816+ kfree(oe);
2817+out:
2818+ return err;
2819+}
2820+
2821+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
2822+ struct nameidata *nd)
2823+{
2824+ int err = ovl_do_lookup(dentry);
2825+
2826+ if (err)
2827+ return ERR_PTR(err);
2828+
2829+ return NULL;
2830+}
2831+
2832+static void ovl_put_super(struct super_block *sb)
2833+{
2834+ struct ovl_fs *ufs = sb->s_fs_info;
2835+
2836+ if (!(sb->s_flags & MS_RDONLY))
2837+ mnt_drop_write(ufs->upper_mnt);
2838+
2839+ mntput(ufs->upper_mnt);
2840+ mntput(ufs->lower_mnt);
2841+
2842+ kfree(ufs->config.lowerdir);
2843+ kfree(ufs->config.upperdir);
2844+ kfree(ufs);
2845+}
2846+
2847+static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data)
2848+{
2849+ int flags = *flagsp;
2850+ struct ovl_fs *ufs = sb->s_fs_info;
2851+
2852+ /* When remounting rw or ro, we need to adjust the write access to the
2853+ * upper fs.
2854+ */
2855+ if (((flags ^ sb->s_flags) & MS_RDONLY) == 0)
2856+ /* No change to readonly status */
2857+ return 0;
2858+
2859+ if (flags & MS_RDONLY) {
2860+ mnt_drop_write(ufs->upper_mnt);
2861+ return 0;
2862+ } else
2863+ return mnt_want_write(ufs->upper_mnt);
2864+}
2865+
2866+/**
2867+ * ovl_statfs
2868+ * @sb: The overlayfs super block
2869+ * @buf: The struct kstatfs to fill in with stats
2870+ *
2871+ * Get the filesystem statistics. As writes always target the upper layer
2872+ * filesystem pass the statfs to the same filesystem.
2873+ */
2874+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
2875+{
2876+ struct dentry *root_dentry = dentry->d_sb->s_root;
2877+ struct path path;
2878+ ovl_path_upper(root_dentry, &path);
2879+
2880+ if (!path.dentry->d_sb->s_op->statfs)
2881+ return -ENOSYS;
2882+ return path.dentry->d_sb->s_op->statfs(path.dentry, buf);
2883+}
2884+
2885+/**
2886+ * ovl_show_options
2887+ *
2888+ * Prints the mount options for a given superblock.
2889+ * Returns zero; does not fail.
2890+ */
2891+static int ovl_show_options(struct seq_file *m, struct vfsmount *mnt)
2892+{
2893+ struct super_block *sb = mnt->mnt_sb;
2894+ struct ovl_fs *ufs = sb->s_fs_info;
2895+
2896+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
2897+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
2898+ return 0;
2899+}
2900+
2901+static const struct super_operations ovl_super_operations = {
2902+ .put_super = ovl_put_super,
2903+ .remount_fs = ovl_remount_fs,
2904+ .statfs = ovl_statfs,
2905+ .show_options = ovl_show_options,
2906+};
2907+
/* Tokens returned by match_token() for the entries of ovl_tokens. */
enum {
	Opt_lowerdir,	/* "lowerdir=..." */
	Opt_upperdir,	/* "upperdir=..." */
	Opt_err,	/* anything unrecognised */
};
2913+
2914+static const match_table_t ovl_tokens = {
2915+ {Opt_lowerdir, "lowerdir=%s"},
2916+ {Opt_upperdir, "upperdir=%s"},
2917+ {Opt_err, NULL}
2918+};
2919+
2920+static int ovl_parse_opt(char *opt, struct ovl_config *config)
2921+{
2922+ char *p;
2923+
2924+ config->upperdir = NULL;
2925+ config->lowerdir = NULL;
2926+
2927+ while ((p = strsep(&opt, ",")) != NULL) {
2928+ int token;
2929+ substring_t args[MAX_OPT_ARGS];
2930+
2931+ if (!*p)
2932+ continue;
2933+
2934+ token = match_token(p, ovl_tokens, args);
2935+ switch (token) {
2936+ case Opt_upperdir:
2937+ kfree(config->upperdir);
2938+ config->upperdir = match_strdup(&args[0]);
2939+ if (!config->upperdir)
2940+ return -ENOMEM;
2941+ break;
2942+
2943+ case Opt_lowerdir:
2944+ kfree(config->lowerdir);
2945+ config->lowerdir = match_strdup(&args[0]);
2946+ if (!config->lowerdir)
2947+ return -ENOMEM;
2948+ break;
2949+
2950+ default:
2951+ return -EINVAL;
2952+ }
2953+ }
2954+ return 0;
2955+}
2956+
2957+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
2958+{
2959+ struct path lowerpath;
2960+ struct path upperpath;
2961+ struct inode *root_inode;
2962+ struct dentry *root_dentry;
2963+ struct ovl_entry *oe;
2964+ struct ovl_fs *ufs;
2965+ int err;
2966+
2967+ err = -ENOMEM;
2968+ ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
2969+ if (!ufs)
2970+ goto out;
2971+
2972+ err = ovl_parse_opt((char *) data, &ufs->config);
2973+ if (err)
2974+ goto out_free_ufs;
2975+
2976+ err = -EINVAL;
2977+ if (!ufs->config.upperdir || !ufs->config.lowerdir) {
2978+ printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");
2979+ goto out_free_config;
2980+ }
2981+
2982+ oe = ovl_alloc_entry();
2983+ if (oe == NULL)
2984+ goto out_free_config;
2985+
2986+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
2987+ if (!root_inode)
2988+ goto out_free_oe;
2989+
2990+ err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath);
2991+ if (err)
2992+ goto out_put_root;
2993+
2994+ err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath);
2995+ if (err)
2996+ goto out_put_upperpath;
2997+
2998+ err = -ENOTDIR;
2999+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
3000+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode))
3001+ goto out_put_lowerpath;
3002+
3003+ sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
3004+ lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
3005+
3006+ err = -EINVAL;
3007+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
3008+ printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n");
3009+ goto out_put_lowerpath;
3010+ }
3011+
3012+
3013+ ufs->upper_mnt = clone_private_mount(&upperpath);
3014+ err = PTR_ERR(ufs->upper_mnt);
3015+ if (IS_ERR(ufs->upper_mnt)) {
3016+ printk(KERN_ERR "overlayfs: failed to clone upperpath\n");
3017+ goto out_put_lowerpath;
3018+ }
3019+
3020+ ufs->lower_mnt = clone_private_mount(&lowerpath);
3021+ err = PTR_ERR(ufs->lower_mnt);
3022+ if (IS_ERR(ufs->lower_mnt)) {
3023+ printk(KERN_ERR "overlayfs: failed to clone lowerpath\n");
3024+ goto out_put_upper_mnt;
3025+ }
3026+
3027+ /*
3028+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
3029+ * will fail instead of modifying lower fs.
3030+ */
3031+ ufs->lower_mnt->mnt_flags |= MNT_READONLY;
3032+
3033+ /* If the upper fs is r/o, we mark overlayfs r/o too */
3034+ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
3035+ sb->s_flags |= MS_RDONLY;
3036+
3037+ if (!(sb->s_flags & MS_RDONLY)) {
3038+ err = mnt_want_write(ufs->upper_mnt);
3039+ if (err)
3040+ goto out_put_lower_mnt;
3041+ }
3042+
3043+ err = -ENOMEM;
3044+ root_dentry = d_alloc_root(root_inode);
3045+ if (!root_dentry)
3046+ goto out_drop_write;
3047+
3048+ mntput(upperpath.mnt);
3049+ mntput(lowerpath.mnt);
3050+
3051+ oe->__upperdentry = dget(upperpath.dentry);
3052+ oe->lowerdentry = lowerpath.dentry;
3053+
3054+ root_dentry->d_fsdata = oe;
3055+ root_dentry->d_op = &ovl_dentry_operations;
3056+
3057+ sb->s_op = &ovl_super_operations;
3058+ sb->s_root = root_dentry;
3059+ sb->s_fs_info = ufs;
3060+
3061+ return 0;
3062+
3063+out_drop_write:
3064+ if (!(sb->s_flags & MS_RDONLY))
3065+ mnt_drop_write(ufs->upper_mnt);
3066+out_put_lower_mnt:
3067+ mntput(ufs->lower_mnt);
3068+out_put_upper_mnt:
3069+ mntput(ufs->upper_mnt);
3070+out_put_lowerpath:
3071+ path_put(&lowerpath);
3072+out_put_upperpath:
3073+ path_put(&upperpath);
3074+out_put_root:
3075+ iput(root_inode);
3076+out_free_oe:
3077+ kfree(oe);
3078+out_free_config:
3079+ kfree(ufs->config.lowerdir);
3080+ kfree(ufs->config.upperdir);
3081+out_free_ufs:
3082+ kfree(ufs);
3083+out:
3084+ return err;
3085+}
3086+
3087+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
3088+ const char *dev_name, void *raw_data)
3089+{
3090+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
3091+}
3092+
3093+static struct file_system_type ovl_fs_type = {
3094+ .owner = THIS_MODULE,
3095+ .name = "overlayfs",
3096+ .mount = ovl_mount,
3097+ .kill_sb = kill_anon_super,
3098+};
3099+
3100+static int __init ovl_init(void)
3101+{
3102+ return register_filesystem(&ovl_fs_type);
3103+}
3104+
3105+static void __exit ovl_exit(void)
3106+{
3107+ unregister_filesystem(&ovl_fs_type);
3108+}
3109+
/* Hook the init/exit functions into module load and unload. */
module_init(ovl_init);
module_exit(ovl_exit);
3112--- a/fs/splice.c
3113+++ b/fs/splice.c
3114@@ -1300,6 +1300,7 @@ long do_splice_direct(struct file *in, l
3115 
3116     return ret;
3117 }
3118+EXPORT_SYMBOL(do_splice_direct);
3119 
3120 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
3121                    struct pipe_inode_info *opipe,
3122--- a/include/linux/fs.h
3123+++ b/include/linux/fs.h
3124@@ -483,6 +483,12 @@ struct iattr {
3125  */
3126 #include <linux/quota.h>
3127 
3128+/*
3129+ * Maximum number of layers of fs stack. Needs to be limited to
3130+ * prevent kernel stack overflow
3131+ */
3132+#define FILESYSTEM_MAX_STACK_DEPTH 2
3133+
3134 /**
3135  * enum positive_aop_returns - aop return codes with specific semantics
3136  *
3137@@ -1463,6 +1469,11 @@ struct super_block {
3138     int cleancache_poolid;
3139 
3140     struct shrinker s_shrink; /* per-sb shrinker handle */
3141+
3142+ /*
3143+ * Indicates how deep in a filesystem stack this SB is
3144+ */
3145+ int s_stack_depth;
3146 };
3147 
3148 /* superblock cache pruning functions */
3149@@ -1620,6 +1631,7 @@ struct inode_operations {
3150     void (*truncate_range)(struct inode *, loff_t, loff_t);
3151     int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
3152               u64 len);
3153+ struct file *(*open)(struct dentry *, int flags, const struct cred *);
3154 } ____cacheline_aligned;
3155 
3156 struct seq_file;
3157@@ -2024,6 +2036,7 @@ extern long do_sys_open(int dfd, const c
3158 extern struct file *filp_open(const char *, int, int);
3159 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
3160                    const char *, int);
3161+extern struct file *vfs_open(struct path *, int flags, const struct cred *);
3162 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
3163                  const struct cred *);
3164 extern int filp_close(struct file *, fl_owner_t id);
3165--- a/include/linux/mount.h
3166+++ b/include/linux/mount.h
3167@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt
3168 extern void mnt_unpin(struct vfsmount *mnt);
3169 extern int __mnt_is_readonly(struct vfsmount *mnt);
3170 
3171+struct path;
3172+extern struct vfsmount *clone_private_mount(struct path *path);
3173+
3174 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
3175                       const char *name, void *data);
3176 
3177

Archive Download this file



interactive