Root/target/linux/generic/patches-3.3/100-overlayfs_v12.patch

1--- a/Documentation/filesystems/Locking
2+++ b/Documentation/filesystems/Locking
3@@ -62,6 +62,7 @@ ata *);
4     int (*removexattr) (struct dentry *, const char *);
5     void (*truncate_range)(struct inode *, loff_t, loff_t);
6     int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
7+ struct file *(*open)(struct dentry *,struct file *,const struct cred *);
8 
9 locking rules:
10     all may block
11@@ -89,6 +90,7 @@ listxattr: no
12 removexattr: yes
13 truncate_range: yes
14 fiemap: no
15+open: no
16     Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
17 victim.
18     cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
19--- /dev/null
20+++ b/Documentation/filesystems/overlayfs.txt
21@@ -0,0 +1,199 @@
22+Written by: Neil Brown <neilb@suse.de>
23+
24+Overlay Filesystem
25+==================
26+
27+This document describes a prototype for a new approach to providing
28+overlay-filesystem functionality in Linux (sometimes referred to as
29+union-filesystems). An overlay-filesystem tries to present a
30+filesystem which is the result of overlaying one filesystem on top
31+of the other.
32+
33+The result will inevitably fail to look exactly like a normal
34+filesystem for various technical reasons. The expectation is that
35+many use cases will be able to ignore these differences.
36+
37+This approach is 'hybrid' because the objects that appear in the
38+filesystem do not all appear to belong to that filesystem. In many
39+cases an object accessed in the union will be indistinguishable
40+from accessing the corresponding object from the original filesystem.
41+This is most obvious from the 'st_dev' field returned by stat(2).
42+
43+While directories will report an st_dev from the overlay-filesystem,
44+all non-directory objects will report an st_dev from the lower or
45+upper filesystem that is providing the object. Similarly st_ino will
46+only be unique when combined with st_dev, and both of these can change
47+over the lifetime of a non-directory object. Many applications and
48+tools ignore these values and will not be affected.
49+
50+Upper and Lower
51+---------------
52+
53+An overlay filesystem combines two filesystems - an 'upper' filesystem
54+and a 'lower' filesystem. When a name exists in both filesystems, the
55+object in the 'upper' filesystem is visible while the object in the
56+'lower' filesystem is either hidden or, in the case of directories,
57+merged with the 'upper' object.
58+
59+It would be more correct to refer to an upper and lower 'directory
60+tree' rather than 'filesystem' as it is quite possible for both
61+directory trees to be in the same filesystem and there is no
62+requirement that the root of a filesystem be given for either upper or
63+lower.
64+
65+The lower filesystem can be any filesystem supported by Linux and does
66+not need to be writable. The lower filesystem can even be another
67+overlayfs. The upper filesystem will normally be writable and if it
68+is it must support the creation of trusted.* extended attributes, and
69+must provide valid d_type in readdir responses, at least for symbolic
70+links - so NFS is not suitable.
71+
72+A read-only overlay of two read-only filesystems may use any
73+filesystem type.
74+
75+Directories
76+-----------
77+
78+Overlaying mainly involves directories. If a given name appears in both
79+upper and lower filesystems and refers to a non-directory in either,
80+then the lower object is hidden - the name refers only to the upper
81+object.
82+
83+Where both upper and lower objects are directories, a merged directory
84+is formed.
85+
86+At mount time, the two directories given as mount options are combined
87+into a merged directory:
88+
89+ mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper /overlay
90+
91+Then whenever a lookup is requested in such a merged directory, the
92+lookup is performed in each actual directory and the combined result
93+is cached in the dentry belonging to the overlay filesystem. If both
94+actual lookups find directories, both are stored and a merged
95+directory is created, otherwise only one is stored: the upper if it
96+exists, else the lower.
97+
98+Only the lists of names from directories are merged. Other content
99+such as metadata and extended attributes are reported for the upper
100+directory only. These attributes of the lower directory are hidden.
101+
102+whiteouts and opaque directories
103+--------------------------------
104+
105+In order to support rm and rmdir without changing the lower
106+filesystem, an overlay filesystem needs to record in the upper filesystem
107+that files have been removed. This is done using whiteouts and opaque
108+directories (non-directories are always opaque).
109+
110+The overlay filesystem uses extended attributes with a
111+"trusted.overlay." prefix to record these details.
112+
113+A whiteout is created as a symbolic link with target
114+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y".
115+When a whiteout is found in the upper level of a merged directory, any
116+matching name in the lower level is ignored, and the whiteout itself
117+is also hidden.
118+
119+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
120+to "y". Where the upper filesystem contains an opaque directory, any
121+directory in the lower filesystem with the same name is ignored.
122+
123+readdir
124+-------
125+
126+When a 'readdir' request is made on a merged directory, the upper and
127+lower directories are each read and the name lists merged in the
128+obvious way (upper is read first, then lower - entries that already
129+exist are not re-added). This merged name list is cached in the
130+'struct file' and so remains as long as the file is kept open. If the
131+directory is opened and read by two processes at the same time, they
132+will each have separate caches. A seekdir to the start of the
133+directory (offset 0) followed by a readdir will cause the cache to be
134+discarded and rebuilt.
135+
136+This means that changes to the merged directory do not appear while a
137+directory is being read. This is unlikely to be noticed by many
138+programs.
139+
140+seek offsets are assigned sequentially when the directories are read.
141+Thus if
142+ - read part of a directory
143+ - remember an offset, and close the directory
144+ - re-open the directory some time later
145+ - seek to the remembered offset
146+
147+there may be little correlation between the old and new locations in
148+the list of filenames, particularly if anything has changed in the
149+directory.
150+
151+Readdir on directories that are not merged is simply handled by the
152+underlying directory (upper or lower).
153+
154+
155+Non-directories
156+---------------
157+
158+Objects that are not directories (files, symlinks, device-special
159+files etc.) are presented either from the upper or lower filesystem as
160+appropriate. When a file in the lower filesystem is accessed in a way
161+that requires write-access, such as opening for write access, changing
162+some metadata etc., the file is first copied from the lower filesystem
163+to the upper filesystem (copy_up). Note that creating a hard-link
164+also requires copy_up, though of course creation of a symlink does
165+not.
166+
167+The copy_up may turn out to be unnecessary, for example if the file is
168+opened for read-write but the data is not modified.
169+
170+The copy_up process first makes sure that the containing directory
171+exists in the upper filesystem - creating it and any parents as
172+necessary. It then creates the object with the same metadata (owner,
173+mode, mtime, symlink-target etc.) and then if the object is a file, the
174+data is copied from the lower to the upper filesystem. Finally any
175+extended attributes are copied up.
176+
177+Once the copy_up is complete, the overlay filesystem simply
178+provides direct access to the newly created file in the upper
179+filesystem - future operations on the file are barely noticed by the
180+overlay filesystem (though an operation on the name of the file such as
181+rename or unlink will of course be noticed and handled).
182+
183+
184+Non-standard behavior
185+---------------------
186+
187+The copy_up operation essentially creates a new, identical file and
188+moves it over to the old name. The new file may be on a different
189+filesystem, so both st_dev and st_ino of the file may change.
190+
191+Any open files referring to this inode will access the old data and
192+metadata. Similarly any file locks obtained before copy_up will not
193+apply to the copied up file.
194+
195+On a file opened with O_RDONLY, fchmod(2), fchown(2), futimesat(2)
196+and fsetxattr(2) will fail with EROFS.
197+
198+If a file with multiple hard links is copied up, then this will
199+"break" the link. Changes will not be propagated to other names
200+referring to the same inode.
201+
202+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
203+object in overlayfs will not contain valid absolute paths, only
204+relative paths leading up to the filesystem's root. This will be
205+fixed in the future.
206+
207+Some operations are not atomic, for example a crash during copy_up or
208+rename will leave the filesystem in an inconsistent state. This will
209+be addressed in the future.
210+
211+Changes to underlying filesystems
212+---------------------------------
213+
214+Offline changes, when the overlay is not mounted, are allowed to either
215+the upper or the lower trees.
216+
217+Changes to the underlying filesystems while part of a mounted overlay
218+filesystem are not allowed. If the underlying filesystem is changed,
219+the behavior of the overlay is undefined, though it will not result in
220+a crash or deadlock.
221--- a/Documentation/filesystems/vfs.txt
222+++ b/Documentation/filesystems/vfs.txt
223@@ -364,6 +364,8 @@ struct inode_operations {
224     ssize_t (*listxattr) (struct dentry *, char *, size_t);
225     int (*removexattr) (struct dentry *, const char *);
226     void (*truncate_range)(struct inode *, loff_t, loff_t);
227+ struct file *(*open) (struct dentry *, struct file *,
228+ const struct cred *);
229 };
230 
231 Again, all methods are called without any locks being held, unless
232@@ -475,6 +477,12 @@ otherwise noted.
233   truncate_range: a method provided by the underlying filesystem to truncate a
234       range of blocks , i.e. punch a hole somewhere in a file.
235 
236+ open: this is an alternative to f_op->open(), the difference is that this
237+ method may return any open file, not necessarily originating from the
238+ same filesystem as the one i_op->open() was called on. It may be useful
239+ for stacking filesystems which want to allow native I/O directly on
240+ underlying files.
241+
242 
243 The Address Space Object
244 ========================
245--- a/MAINTAINERS
246+++ b/MAINTAINERS
247@@ -4928,6 +4928,13 @@ F: drivers/scsi/osd/
248 F: include/scsi/osd_*
249 F: fs/exofs/
250 
251+OVERLAYFS FILESYSTEM
252+M: Miklos Szeredi <miklos@szeredi.hu>
253+L: linux-fsdevel@vger.kernel.org
254+S: Supported
255+F: fs/overlayfs/*
256+F: Documentation/filesystems/overlayfs.txt
257+
258 P54 WIRELESS DRIVER
259 M: Christian Lamparter <chunkeey@googlemail.com>
260 L: linux-wireless@vger.kernel.org
261--- a/fs/Kconfig
262+++ b/fs/Kconfig
263@@ -63,6 +63,7 @@ source "fs/quota/Kconfig"
264 
265 source "fs/autofs4/Kconfig"
266 source "fs/fuse/Kconfig"
267+source "fs/overlayfs/Kconfig"
268 
269 config CUSE
270     tristate "Character device in Userspace support"
271--- a/fs/Makefile
272+++ b/fs/Makefile
273@@ -105,6 +105,7 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4/
274 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
275 obj-$(CONFIG_ADFS_FS) += adfs/
276 obj-$(CONFIG_FUSE_FS) += fuse/
277+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
278 obj-$(CONFIG_UDF_FS) += udf/
279 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
280 obj-$(CONFIG_OMFS_FS) += omfs/
281--- a/fs/ecryptfs/main.c
282+++ b/fs/ecryptfs/main.c
283@@ -544,6 +544,13 @@ static struct dentry *ecryptfs_mount(str
284     s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
285     s->s_blocksize = path.dentry->d_sb->s_blocksize;
286     s->s_magic = ECRYPTFS_SUPER_MAGIC;
287+ s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
288+
289+ rc = -EINVAL;
290+ if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
291+ printk(KERN_ERR "eCryptfs: maximum fs stacking depth exceeded\n");
292+ goto out_free;
293+ }
294 
295     inode = ecryptfs_get_inode(path.dentry->d_inode, s);
296     rc = PTR_ERR(inode);
297--- a/fs/namespace.c
298+++ b/fs/namespace.c
299@@ -1325,6 +1325,24 @@ void drop_collected_mounts(struct vfsmou
300     release_mounts(&umount_list);
301 }
302 
303+/*
304+ * Clone the mount at @path (rooted at path->dentry) with CL_PRIVATE. Yields
305+ * -EINVAL for an unbindable mount and -ENOMEM if clone_mnt() returns NULL.
306+ */
307+struct vfsmount *clone_private_mount(struct path *path)
308+{
309+	struct mount *source = real_mount(path->mnt);
310+	struct mount *clone;
311+
312+	if (IS_MNT_UNBINDABLE(source))
313+		return ERR_PTR(-EINVAL);
314+	down_read(&namespace_sem);
315+	clone = clone_mnt(source, path->dentry, CL_PRIVATE);
316+	up_read(&namespace_sem);
317+	return clone ? &clone->mnt : ERR_PTR(-ENOMEM);
318+}
319+EXPORT_SYMBOL_GPL(clone_private_mount);
320+
321 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
322            struct vfsmount *root)
323 {
324--- a/fs/open.c
325+++ b/fs/open.c
326@@ -644,24 +644,24 @@ static inline int __get_file_write_acces
327     return error;
328 }
329 
330-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
331- struct file *f,
332- int (*open)(struct inode *, struct file *),
333- const struct cred *cred)
334+static struct file *__dentry_open(struct path *path, struct file *f,
335+ int (*open)(struct inode *, struct file *),
336+ const struct cred *cred)
337 {
338     static const struct file_operations empty_fops = {};
339     struct inode *inode;
340     int error;
341 
342+ path_get(path);
343     f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
344                 FMODE_PREAD | FMODE_PWRITE;
345 
346     if (unlikely(f->f_flags & O_PATH))
347         f->f_mode = FMODE_PATH;
348 
349- inode = dentry->d_inode;
350+ inode = path->dentry->d_inode;
351     if (f->f_mode & FMODE_WRITE) {
352- error = __get_file_write_access(inode, mnt);
353+ error = __get_file_write_access(inode, path->mnt);
354         if (error)
355             goto cleanup_file;
356         if (!special_file(inode->i_mode))
357@@ -669,8 +669,7 @@ static struct file *__dentry_open(struct
358     }
359 
360     f->f_mapping = inode->i_mapping;
361- f->f_path.dentry = dentry;
362- f->f_path.mnt = mnt;
363+ f->f_path = *path;
364     f->f_pos = 0;
365     file_sb_list_add(f, inode->i_sb);
366 
367@@ -727,7 +726,7 @@ cleanup_all:
368              * here, so just reset the state.
369              */
370             file_reset_write(f);
371- mnt_drop_write(mnt);
372+ mnt_drop_write(path->mnt);
373         }
374     }
375     file_sb_list_del(f);
376@@ -735,8 +734,7 @@ cleanup_all:
377     f->f_path.mnt = NULL;
378 cleanup_file:
379     put_filp(f);
380- dput(dentry);
381- mntput(mnt);
382+ path_put(path);
383     return ERR_PTR(error);
384 }
385 
386@@ -762,14 +760,14 @@ cleanup_file:
387 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
388         int (*open)(struct inode *, struct file *))
389 {
390+ struct path path = { .dentry = dentry, .mnt = nd->path.mnt };
391     const struct cred *cred = current_cred();
392 
393     if (IS_ERR(nd->intent.open.file))
394         goto out;
395     if (IS_ERR(dentry))
396         goto out_err;
397- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
398- nd->intent.open.file,
399+ nd->intent.open.file = __dentry_open(&path, nd->intent.open.file,
400                          open, cred);
401 out:
402     return nd->intent.open.file;
403@@ -797,11 +795,9 @@ struct file *nameidata_to_filp(struct na
404     nd->intent.open.file = NULL;
405 
406     /* Has the filesystem initialised the file for us? */
407- if (filp->f_path.dentry == NULL) {
408- path_get(&nd->path);
409- filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
410- NULL, cred);
411- }
412+ if (filp->f_path.dentry == NULL)
413+ filp = vfs_open(&nd->path, filp, cred);
414+
415     return filp;
416 }
417 
418@@ -812,27 +808,48 @@ struct file *nameidata_to_filp(struct na
419 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
420              const struct cred *cred)
421 {
422- int error;
423     struct file *f;
424+ struct file *ret;
425+ struct path path = { .dentry = dentry, .mnt = mnt };
426 
427     validate_creds(cred);
428 
429     /* We must always pass in a valid mount pointer. */
430     BUG_ON(!mnt);
431 
432- error = -ENFILE;
433+ ret = ERR_PTR(-ENFILE);
434     f = get_empty_filp();
435- if (f == NULL) {
436- dput(dentry);
437- mntput(mnt);
438- return ERR_PTR(error);
439+ if (f != NULL) {
440+ f->f_flags = flags;
441+ ret = vfs_open(&path, f, cred);
442     }
443+ path_put(&path);
444 
445- f->f_flags = flags;
446- return __dentry_open(dentry, mnt, f, NULL, cred);
447+ return ret;
448 }
449 EXPORT_SYMBOL(dentry_open);
450 
451+/**
452+ * vfs_open - open the file at the given path
453+ * @path: path to open
454+ * @filp: newly allocated file with f_flags already set by the caller
455+ * @cred: credentials to use
456+ *
457+ * Dispatches to the inode's i_op->open() when the filesystem supplies one
458+ * and to __dentry_open() otherwise. If successful, the returned file will
459+ * have acquired an additional reference for @path.
460+ */
461+struct file *vfs_open(struct path *path, struct file *filp,
462+		      const struct cred *cred)
463+{
464+	const struct inode_operations *iops = path->dentry->d_inode->i_op;
465+
466+	if (!iops->open)
467+		return __dentry_open(path, filp, NULL, cred);
468+	return iops->open(path->dentry, filp, cred);
469+}
470+EXPORT_SYMBOL(vfs_open);
471+
472 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
473 {
474     struct fdtable *fdt = files_fdtable(files);
475--- /dev/null
476+++ b/fs/overlayfs/Kconfig
477@@ -0,0 +1,4 @@
478+config OVERLAYFS_FS
479+ tristate "Overlay filesystem support"
480+ help
481+ Add support for overlay filesystem.
482--- /dev/null
483+++ b/fs/overlayfs/Makefile
484@@ -0,0 +1,7 @@
485+#
486+# Makefile for the overlay filesystem.
487+#
488+
489+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
490+
491+overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
492--- /dev/null
493+++ b/fs/overlayfs/copy_up.c
494@@ -0,0 +1,384 @@
495+/*
496+ *
497+ * Copyright (C) 2011 Novell Inc.
498+ *
499+ * This program is free software; you can redistribute it and/or modify it
500+ * under the terms of the GNU General Public License version 2 as published by
501+ * the Free Software Foundation.
502+ */
503+
504+#include <linux/fs.h>
505+#include <linux/slab.h>
506+#include <linux/file.h>
507+#include <linux/splice.h>
508+#include <linux/xattr.h>
509+#include <linux/security.h>
510+#include <linux/uaccess.h>
511+#include "overlayfs.h"
512+
513+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
514+
515+/*
516+ * Copy every extended attribute of @old onto @new. Returns 0 on success, when
517+ * either inode lacks ->getxattr or listing gives -EOPNOTSUPP; else -errno.
518+ */
519+static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)
520+{
521+	ssize_t list_size, size;
522+	char *buf, *name, *value;
523+	int error = -ENOMEM;
524+
525+	if (!old->d_inode->i_op->getxattr || !new->d_inode->i_op->getxattr)
526+		return 0;
527+
528+	list_size = vfs_listxattr(old, NULL, 0);
529+	if (list_size <= 0)
530+		return list_size == -EOPNOTSUPP ? 0 : list_size;
531+
532+	buf = kzalloc(list_size, GFP_KERNEL);
533+	if (!buf)
534+		return -ENOMEM;
535+
536+	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
537+	if (!value)
538+		goto out;
539+
540+	list_size = vfs_listxattr(old, buf, list_size);
541+	if (list_size <= 0) {
542+		error = list_size;
543+		goto out_free_value;
544+	}
545+
546+	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
547+		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
548+		/* Only a negative size is an error; an empty value is copied too */
549+		if (size < 0) {
550+			error = size;
551+			goto out_free_value;
552+		}
553+		error = vfs_setxattr(new, name, value, size, 0);
554+		if (error)
555+			goto out_free_value;
556+	}
557+
558+out_free_value:
559+	kfree(value);
560+out:
561+	kfree(buf);
562+	return error;
563+}
564+/* Splice up to @len bytes from the file at @old into the file at @new; returns 0 or -errno. */
565+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
566+{
567+ struct file *old_file;
568+ struct file *new_file;
569+ int error = 0;
570+
571+ if (len == 0) /* nothing to copy, so no file is opened */
572+ return 0;
573+
574+ old_file = ovl_path_open(old, O_RDONLY);
575+ if (IS_ERR(old_file))
576+ return PTR_ERR(old_file);
577+
578+ new_file = ovl_path_open(new, O_WRONLY);
579+ if (IS_ERR(new_file)) {
580+ error = PTR_ERR(new_file);
581+ goto out_fput;
582+ }
583+
584+ /* FIXME: copy up sparse files efficiently */
585+ while (len) {
586+ loff_t offset = new_file->f_pos; /* source read offset is taken from the destination's f_pos */
587+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE; /* per-pass cap, defined above as 1 << 20 */
588+ long bytes;
589+
590+ if (len < this_len)
591+ this_len = len;
592+
593+ if (signal_pending_state(TASK_KILLABLE, current)) { /* checked before every chunk */
594+ error = -EINTR;
595+ break;
596+ }
597+
598+ bytes = do_splice_direct(old_file, &offset, new_file, this_len,
599+ SPLICE_F_MOVE);
600+ if (bytes <= 0) { /* 0 ends the copy with error 0; a negative count becomes the error */
601+ error = bytes;
602+ break;
603+ }
604+
605+ len -= bytes;
606+ }
607+
608+ fput(new_file);
609+out_fput:
610+ fput(old_file);
611+ return error;
612+}
613+
614+/*
615+ * Read the target of the symlink @realdentry into a freshly allocated page.
616+ * Returns the NUL-terminated target, or ERR_PTR(-EINVAL) when the inode has
617+ * no ->readlink, ERR_PTR(-ENOMEM) on allocation failure, or readlink's error.
618+ */
619+static char *ovl_read_symlink(struct dentry *realdentry)
620+{
621+	struct inode *inode = realdentry->d_inode;
622+	mm_segment_t saved_fs;
623+	char *page;
624+	int len;
625+
626+	if (!inode->i_op->readlink)
627+		return ERR_PTR(-EINVAL);
628+
629+	page = (char *) __get_free_page(GFP_KERNEL);
630+	if (!page)
631+		return ERR_PTR(-ENOMEM);
632+
633+	/* set_fs(get_ds()) is what makes the __user cast of this buffer valid */
634+	saved_fs = get_fs();
635+	set_fs(get_ds());
636+	len = inode->i_op->readlink(realdentry,
637+				    (char __user *)page, PAGE_SIZE - 1);
638+	set_fs(saved_fs);
639+	if (len < 0) {
640+		free_page((unsigned long) page);
641+		return ERR_PTR(len);
642+	}
643+	page[len] = '\0';
644+
645+	return page;
646+}
647+
648+/* Set atime and mtime of @upperdentry to the values recorded in @stat. */
649+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
650+{
651+	struct iattr times = {
652+		.ia_atime = stat->atime,
653+		.ia_mtime = stat->mtime,
654+		.ia_valid = ATTR_ATIME | ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET,
655+	};
656+
657+	return notify_change(upperdentry, &times);
658+}
659+
660+/*
661+ * Change the mode of @upperdentry to @mode through notify_change().
662+ */
663+static int ovl_set_mode(struct dentry *upperdentry, umode_t mode)
664+{
665+	struct iattr mode_attr = { .ia_valid = ATTR_MODE, .ia_mode = mode };
666+
667+	return notify_change(upperdentry, &mode_attr);
668+}
669+/* Create the upper copy of @dentry in @upperdir (object, data, xattrs, mode, times); called by ovl_copy_up_one() with upperdir's i_mutex held. */
670+static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,
671+ struct path *lowerpath, struct kstat *stat,
672+ const char *link)
673+{
674+ int err;
675+ struct path newpath;
676+ umode_t mode = stat->mode; /* full mode saved; stat->mode is cut down to the type bits below */
677+
678+ /* Can't properly set mode on creation because of the umask */
679+ stat->mode &= S_IFMT;
680+
681+ ovl_path_upper(dentry, &newpath);
682+ WARN_ON(newpath.dentry); /* an upper dentry is not expected before the copy up */
683+ newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link);
684+ if (IS_ERR(newpath.dentry))
685+ return PTR_ERR(newpath.dentry);
686+
687+ if (S_ISREG(stat->mode)) { /* data is copied for regular files only */
688+ err = ovl_copy_up_data(lowerpath, &newpath, stat->size);
689+ if (err)
690+ goto err_remove;
691+ }
692+
693+ err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry);
694+ if (err)
695+ goto err_remove;
696+
697+ mutex_lock(&newpath.dentry->d_inode->i_mutex);
698+ if (!S_ISLNK(stat->mode)) /* the saved mode is not applied to symlinks */
699+ err = ovl_set_mode(newpath.dentry, mode);
700+ if (!err)
701+ err = ovl_set_timestamps(newpath.dentry, stat);
702+ mutex_unlock(&newpath.dentry->d_inode->i_mutex);
703+ if (err)
704+ goto err_remove;
705+
706+ ovl_dentry_update(dentry, newpath.dentry);
707+
708+ /*
709+ * Easiest way to get rid of the lower dentry reference is to
710+ * drop this dentry. This is neither needed nor possible for
711+ * directories.
712+ */
713+ if (!S_ISDIR(stat->mode))
714+ d_drop(dentry);
715+
716+ return 0;
717+
718+err_remove: /* undo the creation in the upper directory and drop the new dentry */
719+ if (S_ISDIR(stat->mode))
720+ vfs_rmdir(upperdir->d_inode, newpath.dentry);
721+ else
722+ vfs_unlink(upperdir->d_inode, newpath.dentry);
723+
724+ dput(newpath.dentry);
725+
726+ return err;
727+}
728+
729+/*
730+ * Copy up a single dentry
731+ *
732+ * Directory renames only allowed on "pure upper" (already created on
733+ * upper filesystem, never copied up). Directories which are on lower or
734+ * are merged may not be renamed. For these -EXDEV is returned and
735+ * userspace has to deal with it. This means, when copying up a
736+ * directory we can rely on it and ancestors being stable.
737+ *
738+ * Non-directory renames start with copy up of source if necessary. The
739+ * actual rename will only proceed once the copy up was successful. Copy
740+ * up uses upper parent i_mutex for exclusion. Since rename can change
741+ * d_parent it is possible that the copy up will lock the old parent. At
742+ * that point the file will have already been copied up anyway.
743+ */
744+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
745+ struct path *lowerpath, struct kstat *stat)
746+{
747+ int err;
748+ struct kstat pstat;
749+ struct path parentpath;
750+ struct dentry *upperdir;
751+ const struct cred *old_cred;
752+ struct cred *override_cred;
753+ char *link = NULL;
754+
755+ ovl_path_upper(parent, &parentpath);
756+ upperdir = parentpath.dentry; /* upper side of @parent: the directory the copy is created in */
757+
758+ err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat); /* kept so the parent's timestamps can be put back below */
759+ if (err)
760+ return err;
761+
762+ if (S_ISLNK(stat->mode)) { /* the symlink target is read before credentials are overridden */
763+ link = ovl_read_symlink(lowerpath->dentry);
764+ if (IS_ERR(link))
765+ return PTR_ERR(link);
766+ }
767+
768+ err = -ENOMEM;
769+ override_cred = prepare_creds();
770+ if (!override_cred)
771+ goto out_free_link;
772+
773+ override_cred->fsuid = stat->uid; /* fsuid/fsgid take the owner recorded in @stat */
774+ override_cred->fsgid = stat->gid;
775+ /*
776+ * CAP_SYS_ADMIN for copying up extended attributes
777+ * CAP_DAC_OVERRIDE for create
778+ * CAP_FOWNER for chmod, timestamp update
779+ * CAP_FSETID for chmod
780+ * CAP_MKNOD for mknod
781+ */
782+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
783+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
784+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
785+ cap_raise(override_cred->cap_effective, CAP_FSETID);
786+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
787+ old_cred = override_creds(override_cred);
788+
789+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
790+ if (ovl_path_type(dentry) != OVL_PATH_LOWER) { /* re-checked under the lock: anything but LOWER needs no copy */
791+ err = 0;
792+ } else {
793+ err = ovl_copy_up_locked(upperdir, dentry, lowerpath,
794+ stat, link);
795+ if (!err) {
796+ /* Restore timestamps on parent (best effort) */
797+ ovl_set_timestamps(upperdir, &pstat);
798+ }
799+ }
800+
801+ mutex_unlock(&upperdir->d_inode->i_mutex);
802+
803+ revert_creds(old_cred);
804+ put_cred(override_cred);
805+
806+out_free_link: /* link, when set, is the page returned by ovl_read_symlink() */
807+ if (link)
808+ free_page((unsigned long) link);
809+
810+ return err;
811+}
812+/* Copy up @dentry and each ancestor still of type OVL_PATH_LOWER, topmost first; returns 0 or -errno. */
813+int ovl_copy_up(struct dentry *dentry)
814+{
815+ int err;
816+
817+ err = 0;
818+ while (!err) { /* each pass copies up exactly one dentry */
819+ struct dentry *next;
820+ struct dentry *parent;
821+ struct path lowerpath;
822+ struct kstat stat;
823+ enum ovl_path_type type = ovl_path_type(dentry);
824+
825+ if (type != OVL_PATH_LOWER) /* @dentry itself is no longer lower-only: done */
826+ break;
827+
828+ next = dget(dentry);
829+ /* find the topmost dentry not yet copied up */
830+ for (;;) {
831+ parent = dget_parent(next);
832+
833+ type = ovl_path_type(parent);
834+ if (type != OVL_PATH_LOWER) /* leaves with references held on both next and parent */
835+ break;
836+
837+ dput(next); /* drop the child; the parent's reference is carried on as next */
838+ next = parent;
839+ }
840+
841+ ovl_path_lower(next, &lowerpath);
842+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
843+ if (!err)
844+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
845+
846+ dput(parent); /* pairs with the dget()/dget_parent() calls above */
847+ dput(next);
848+ }
849+
850+ return err;
851+}
852+
853+/*
854+ * Copy up @dentry with at most @size bytes of data instead of copying the
855+ * whole file and truncating it later. The parent is copied up first.
856+ */
857+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)
858+{
859+	struct dentry *parent = dget_parent(dentry);
860+	struct path lowerpath;
861+	struct kstat stat;
862+	int err = ovl_copy_up(parent);
863+
864+	if (err)
865+		goto out_dput_parent;
866+
867+	ovl_path_lower(dentry, &lowerpath);
868+	err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
869+	if (err)
870+		goto out_dput_parent;
871+
872+	if (stat.size > size)
873+		stat.size = size;
874+	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
875+out_dput_parent:
876+	dput(parent);
877+	return err;
878+}
879--- /dev/null
880+++ b/fs/overlayfs/dir.c
881@@ -0,0 +1,596 @@
882+/*
883+ *
884+ * Copyright (C) 2011 Novell Inc.
885+ *
886+ * This program is free software; you can redistribute it and/or modify it
887+ * under the terms of the GNU General Public License version 2 as published by
888+ * the Free Software Foundation.
889+ */
890+
891+#include <linux/fs.h>
892+#include <linux/namei.h>
893+#include <linux/xattr.h>
894+#include <linux/security.h>
895+#include "overlayfs.h"
896+
897+static const char *ovl_whiteout_symlink = "(overlay-whiteout)";
898+/* Create a whiteout for @dentry in @upperdir: a symlink tagged with the whiteout xattr; returns 0 or -errno. */
899+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)
900+{
901+ int err;
902+ struct dentry *newdentry;
903+ const struct cred *old_cred;
904+ struct cred *override_cred;
905+
906+ /* FIXME: recheck lower dentry to see if whiteout is really needed */
907+
908+ err = -ENOMEM;
909+ override_cred = prepare_creds();
910+ if (!override_cred)
911+ goto out;
912+
913+ /*
914+ * CAP_SYS_ADMIN for setxattr
915+ * CAP_DAC_OVERRIDE for symlink creation
916+ * CAP_FOWNER for unlink in sticky directory
917+ */
918+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
919+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
920+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
921+ override_cred->fsuid = 0; /* act with fsuid/fsgid 0 while the credentials are overridden */
922+ override_cred->fsgid = 0;
923+ old_cred = override_creds(override_cred);
924+
925+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
926+ dentry->d_name.len);
927+ err = PTR_ERR(newdentry); /* set up front; only reported if the lookup failed */
928+ if (IS_ERR(newdentry))
929+ goto out_put_cred;
930+
931+ /* Just been removed within the same locked region */
932+ WARN_ON(newdentry->d_inode);
933+
934+ err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink); /* target is "(overlay-whiteout)" */
935+ if (err)
936+ goto out_dput;
937+
938+ ovl_dentry_version_inc(dentry->d_parent);
939+
940+ err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0);
941+ if (err) /* tagging failed: remove the symlink that was just created */
942+ vfs_unlink(upperdir->d_inode, newdentry);
943+
944+out_dput:
945+ dput(newdentry);
946+out_put_cred:
947+ revert_creds(old_cred);
948+ put_cred(override_cred);
949+out:
950+ if (err) {
951+ /*
952+ * There's no way to recover from failure to whiteout.
953+ * What should we do? Log a big fat error and... ?
954+ */
955+ printk(KERN_ERR "overlayfs: ERROR - failed to whiteout '%s'\n",
956+ dentry->d_name.name);
957+ }
958+
959+ return err;
960+}
961+/* Look up @template's name in @upperdir for creation, unlinking a whiteout found there; returns the dentry or ERR_PTR(). */
962+static struct dentry *ovl_lookup_create(struct dentry *upperdir,
963+ struct dentry *template)
964+{
965+ int err;
966+ struct dentry *newdentry;
967+ struct qstr *name = &template->d_name;
968+
969+ newdentry = lookup_one_len(name->name, upperdir, name->len);
970+ if (IS_ERR(newdentry))
971+ return newdentry;
972+
973+ if (newdentry->d_inode) { /* the name is already taken in the upper directory */
974+ const struct cred *old_cred;
975+ struct cred *override_cred;
976+
977+ /* No need to check whiteout if lower parent is non-existent */
978+ err = -EEXIST;
979+ if (!ovl_dentry_lower(template->d_parent))
980+ goto out_dput;
981+
982+ if (!S_ISLNK(newdentry->d_inode->i_mode)) /* not a symlink: fails with the -EEXIST set above */
983+ goto out_dput;
984+
985+ err = -ENOMEM;
986+ override_cred = prepare_creds();
987+ if (!override_cred)
988+ goto out_dput;
989+
990+ /*
991+ * CAP_SYS_ADMIN for getxattr
992+ * CAP_FOWNER for unlink in sticky directory
993+ */
994+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
995+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
996+ old_cred = override_creds(override_cred);
997+
998+ err = -EEXIST; /* stays -EEXIST unless the entry is a whiteout that gets unlinked */
999+ if (ovl_is_whiteout(newdentry))
1000+ err = vfs_unlink(upperdir->d_inode, newdentry);
1001+
1002+ revert_creds(old_cred);
1003+ put_cred(override_cred);
1004+ if (err)
1005+ goto out_dput;
1006+
1007+ dput(newdentry);
1008+ newdentry = lookup_one_len(name->name, upperdir, name->len); /* second lookup, after the whiteout was unlinked */
1009+ if (IS_ERR(newdentry)) {
1010+ ovl_whiteout(upperdir, template); /* lookup failed: put a whiteout back */
1011+ return newdentry;
1012+ }
1013+
1014+ /*
1015+ * Whiteout just been successfully removed, parent
1016+ * i_mutex is still held, there's no way the lookup
1017+ * could return positive.
1018+ */
1019+ WARN_ON(newdentry->d_inode);
1020+ }
1021+
1022+ return newdentry;
1023+
1024+out_dput:
1025+ dput(newdentry);
1026+ return ERR_PTR(err);
1027+}
1028+/* Create @dentry's upper object in @upperdir; the type comes from @stat->mode. */
1029+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1030+ struct kstat *stat, const char *link)
1031+{
1032+ int err;
1033+ struct dentry *newdentry;
1034+ struct inode *dir = upperdir->d_inode;
1035+
1036+ newdentry = ovl_lookup_create(upperdir, dentry);
1037+ if (IS_ERR(newdentry))
1038+ goto out;
1039+
1040+ switch (stat->mode & S_IFMT) {
1041+ case S_IFREG:
1042+ err = vfs_create(dir, newdentry, stat->mode, NULL);
1043+ break;
1044+
1045+ case S_IFDIR:
1046+ err = vfs_mkdir(dir, newdentry, stat->mode);
1047+ break;
1048+
1049+ case S_IFCHR:
1050+ case S_IFBLK:
1051+ case S_IFIFO:
1052+ case S_IFSOCK:
1053+ err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev);
1054+ break;
1055+
1056+ case S_IFLNK:
1057+ err = vfs_symlink(dir, newdentry, link); /* @link is the symlink target */
1058+ break;
1059+
1060+ default:
1061+ err = -EPERM;
1062+ }
1063+ if (err) {
1064+ if (ovl_dentry_is_opaque(dentry)) /* presumably restores a removed whiteout */
1065+ ovl_whiteout(upperdir, dentry);
1066+ dput(newdentry);
1067+ newdentry = ERR_PTR(err);
1068+ } else if (WARN_ON(!newdentry->d_inode)) {
1069+ /*
1070+ * Not quite sure if non-instantiated dentry is legal or not.
1071+ * VFS doesn't seem to care so check and warn here.
1072+ */
1073+ dput(newdentry);
1074+ newdentry = ERR_PTR(-ENOENT);
1075+ }
1076+
1077+out:
1078+ return newdentry; /* new dentry on success, ERR_PTR() on failure */
1079+
1080+}
1081+/* Set ovl_opaque_xattr to "y" on @upperdentry with CAP_SYS_ADMIN raised. */
1082+static int ovl_set_opaque(struct dentry *upperdentry)
1083+{
1084+ int err;
1085+ const struct cred *old_cred;
1086+ struct cred *override_cred;
1087+
1088+ override_cred = prepare_creds();
1089+ if (!override_cred)
1090+ return -ENOMEM;
1091+
1092+ /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */
1093+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1094+ old_cred = override_creds(override_cred);
1095+ err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
1096+ revert_creds(old_cred); /* back to the caller's credentials */
1097+ put_cred(override_cred);
1098+
1099+ return err;
1100+}
1101+/* Remove ovl_opaque_xattr from @upperdentry with CAP_SYS_ADMIN raised. */
1102+static int ovl_remove_opaque(struct dentry *upperdentry)
1103+{
1104+ int err;
1105+ const struct cred *old_cred;
1106+ struct cred *override_cred;
1107+
1108+ override_cred = prepare_creds();
1109+ if (!override_cred)
1110+ return -ENOMEM;
1111+
1112+ /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */
1113+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1114+ old_cred = override_creds(override_cred);
1115+ err = vfs_removexattr(upperdentry, ovl_opaque_xattr);
1116+ revert_creds(old_cred); /* back to the caller's credentials */
1117+ put_cred(override_cred);
1118+
1119+ return err;
1120+}
1121+/* ->getattr for directories: real attributes, overlay's own st_dev/st_ino. */
1122+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
1123+ struct kstat *stat)
1124+{
1125+ int err;
1126+ enum ovl_path_type type;
1127+ struct path realpath;
1128+
1129+ type = ovl_path_real(dentry, &realpath);
1130+ err = vfs_getattr(realpath.mnt, realpath.dentry, stat);
1131+ if (err)
1132+ return err;
1133+
1134+ stat->dev = dentry->d_sb->s_dev; /* overlay superblock, not the real fs */
1135+ stat->ino = dentry->d_inode->i_ino; /* overlay inode number */
1136+
1137+ /*
1138+ * It's probably not worth it to count subdirs to get the
1139+ * correct link count. nlink=1 seems to pacify 'find' and
1140+ * other utilities.
1141+ */
1142+ if (type == OVL_PATH_MERGE)
1143+ stat->nlink = 1;
1144+
1145+ return 0;
1146+}
1147+/* Common body of create/mkdir/mknod/symlink: make @dentry in the upper dir. */
1148+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
1149+ const char *link)
1150+{
1151+ int err;
1152+ struct dentry *newdentry;
1153+ struct dentry *upperdir;
1154+ struct inode *inode;
1155+ struct kstat stat = {
1156+ .mode = mode,
1157+ .rdev = rdev,
1158+ };
1159+
1160+ err = -ENOMEM;
1161+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
1162+ if (!inode)
1163+ goto out;
1164+
1165+ err = ovl_copy_up(dentry->d_parent); /* parent must exist in the upper layer */
1166+ if (err)
1167+ goto out_iput;
1168+
1169+ upperdir = ovl_dentry_upper(dentry->d_parent);
1170+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1171+
1172+ newdentry = ovl_upper_create(upperdir, dentry, &stat, link);
1173+ err = PTR_ERR(newdentry);
1174+ if (IS_ERR(newdentry))
1175+ goto out_unlock;
1176+
1177+ ovl_dentry_version_inc(dentry->d_parent);
1178+ if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) {
1179+ err = ovl_set_opaque(newdentry);
1180+ if (err) {
1181+ vfs_rmdir(upperdir->d_inode, newdentry); /* undo the mkdir */
1182+ ovl_whiteout(upperdir, dentry);
1183+ goto out_dput;
1184+ }
1185+ }
1186+ ovl_dentry_update(dentry, newdentry);
1187+ d_instantiate(dentry, inode);
1188+ inode = NULL; /* handed over: the cleanup labels below must not release them */
1189+ newdentry = NULL;
1190+ err = 0;
1191+
1192+out_dput:
1193+ dput(newdentry);
1194+out_unlock:
1195+ mutex_unlock(&upperdir->d_inode->i_mutex);
1196+out_iput:
1197+ iput(inode);
1198+out:
1199+ return err;
1200+}
1201+/* ->create: regular file; only the low 07777 bits of @mode are kept. */
1202+static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1203+ struct nameidata *nd)
1204+{
1205+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
1206+}
1207+/* ->mkdir: directory; only the low 07777 bits of @mode are kept. */
1208+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1209+{
1210+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
1211+}
1212+/* ->mknod: @mode (type bits included) and @rdev are passed through unchanged. */
1213+static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
1214+ dev_t rdev)
1215+{
1216+ return ovl_create_object(dentry, mode, rdev, NULL);
1217+}
1218+/* ->symlink: create a symlink whose target string is @link. */
1219+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
1220+ const char *link)
1221+{
1222+ return ovl_create_object(dentry, S_IFLNK, 0, link);
1223+}
1224+/* Shared by unlink/rmdir: remove the upper object and/or leave a whiteout. */
1225+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
1226+{
1227+ int err;
1228+ enum ovl_path_type type;
1229+ struct path realpath;
1230+ struct dentry *upperdir;
1231+
1232+ err = ovl_copy_up(dentry->d_parent);
1233+ if (err)
1234+ return err;
1235+
1236+ upperdir = ovl_dentry_upper(dentry->d_parent);
1237+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1238+ type = ovl_path_real(dentry, &realpath);
1239+ if (type != OVL_PATH_LOWER) {
1240+ err = -ESTALE;
1241+ if (realpath.dentry->d_parent != upperdir) /* real object is no longer in this dir */
1242+ goto out_d_drop;
1243+
1244+ /* FIXME: create whiteout up front and rename to target */
1245+
1246+ if (is_dir)
1247+ err = vfs_rmdir(upperdir->d_inode, realpath.dentry);
1248+ else
1249+ err = vfs_unlink(upperdir->d_inode, realpath.dentry);
1250+ if (err)
1251+ goto out_d_drop;
1252+
1253+ ovl_dentry_version_inc(dentry->d_parent);
1254+ }
1255+
1256+ if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry))
1257+ err = ovl_whiteout(upperdir, dentry);
1258+
1259+ /*
1260+ * Keeping this dentry hashed would mean having to release
1261+ * upperpath/lowerpath, which could only be done if we are the
1262+ * sole user of this dentry. Too tricky... Just unhash for
1263+ * now.
1264+ */
1265+out_d_drop:
1266+ d_drop(dentry);
1267+ mutex_unlock(&upperdir->d_inode->i_mutex);
1268+
1269+ return err;
1270+}
1271+/* ->unlink: non-directory removal via ovl_do_remove(). */
1272+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
1273+{
1274+ return ovl_do_remove(dentry, false);
1275+}
1276+
1277+/* ->rmdir: a directory that is not purely upper is checked for emptiness first. */
1278+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
1279+{
1280+ int err;
1281+ enum ovl_path_type type;
1282+
1283+ type = ovl_path_type(dentry);
1284+ if (type != OVL_PATH_UPPER) {
1285+ err = ovl_check_empty_and_clear(dentry, type);
1286+ if (err)
1287+ return err;
1288+ }
1289+
1290+ return ovl_do_remove(dentry, true);
1291+}
1292+/* ->link: copy up @old and @new's parent, then hard-link in the upper layer. */
1293+static int ovl_link(struct dentry *old, struct inode *newdir,
1294+ struct dentry *new)
1295+{
1296+ int err;
1297+ struct dentry *olddentry;
1298+ struct dentry *newdentry;
1299+ struct dentry *upperdir;
1300+
1301+ err = ovl_copy_up(old);
1302+ if (err)
1303+ goto out;
1304+
1305+ err = ovl_copy_up(new->d_parent);
1306+ if (err)
1307+ goto out;
1308+
1309+ upperdir = ovl_dentry_upper(new->d_parent);
1310+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1311+ newdentry = ovl_lookup_create(upperdir, new);
1312+ err = PTR_ERR(newdentry);
1313+ if (IS_ERR(newdentry))
1314+ goto out_unlock;
1315+
1316+ olddentry = ovl_dentry_upper(old);
1317+ err = vfs_link(olddentry, upperdir->d_inode, newdentry);
1318+ if (!err) {
1319+ if (WARN_ON(!newdentry->d_inode)) {
1320+ dput(newdentry);
1321+ err = -ENOENT;
1322+ goto out_unlock;
1323+ }
1324+
1325+ ovl_dentry_version_inc(new->d_parent);
1326+ ovl_dentry_update(new, newdentry);
1327+
1328+ ihold(old->d_inode); /* @new shares @old's overlay inode */
1329+ d_instantiate(new, old->d_inode);
1330+ } else {
1331+ if (ovl_dentry_is_opaque(new)) /* presumably restores a removed whiteout */
1332+ ovl_whiteout(upperdir, new);
1333+ dput(newdentry);
1334+ }
1335+out_unlock:
1336+ mutex_unlock(&upperdir->d_inode->i_mutex);
1337+out:
1338+ return err;
1339+
1340+}
1341+/* ->rename: copy up, rename in the upper layer, then fix whiteout/opaque state. */
1342+static int ovl_rename(struct inode *olddir, struct dentry *old,
1343+ struct inode *newdir, struct dentry *new)
1344+{
1345+ int err;
1346+ enum ovl_path_type old_type;
1347+ enum ovl_path_type new_type;
1348+ struct dentry *old_upperdir;
1349+ struct dentry *new_upperdir;
1350+ struct dentry *olddentry;
1351+ struct dentry *newdentry;
1352+ struct dentry *trap;
1353+ bool old_opaque;
1354+ bool new_opaque;
1355+ bool new_create = false;
1356+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
1357+
1358+ /* Don't copy up directory trees */
1359+ old_type = ovl_path_type(old);
1360+ if (old_type != OVL_PATH_UPPER && is_dir)
1361+ return -EXDEV;
1362+
1363+ if (new->d_inode) {
1364+ new_type = ovl_path_type(new);
1365+
1366+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
1367+ if (ovl_dentry_lower(old)->d_inode ==
1368+ ovl_dentry_lower(new)->d_inode)
1369+ return 0; /* same lower inode: nothing to do */
1370+ }
1371+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
1372+ if (ovl_dentry_upper(old)->d_inode ==
1373+ ovl_dentry_upper(new)->d_inode)
1374+ return 0; /* same upper inode: nothing to do */
1375+ }
1376+
1377+ if (new_type != OVL_PATH_UPPER &&
1378+ S_ISDIR(new->d_inode->i_mode)) {
1379+ err = ovl_check_empty_and_clear(new, new_type);
1380+ if (err)
1381+ return err;
1382+ }
1383+ } else {
1384+ new_type = OVL_PATH_UPPER;
1385+ }
1386+
1387+ err = ovl_copy_up(old);
1388+ if (err)
1389+ return err;
1390+
1391+ err = ovl_copy_up(new->d_parent);
1392+ if (err)
1393+ return err;
1394+
1395+ old_upperdir = ovl_dentry_upper(old->d_parent);
1396+ new_upperdir = ovl_dentry_upper(new->d_parent);
1397+
1398+ trap = lock_rename(new_upperdir, old_upperdir);
1399+
1400+ olddentry = ovl_dentry_upper(old);
1401+ newdentry = ovl_dentry_upper(new);
1402+ if (newdentry) {
1403+ dget(newdentry); /* balances the dput() at out_dput */
1404+ } else {
1405+ new_create = true;
1406+ newdentry = ovl_lookup_create(new_upperdir, new);
1407+ err = PTR_ERR(newdentry);
1408+ if (IS_ERR(newdentry))
1409+ goto out_unlock;
1410+ }
1411+
1412+ err = -ESTALE; /* returned by each of the four sanity checks below */
1413+ if (olddentry->d_parent != old_upperdir)
1414+ goto out_dput;
1415+ if (newdentry->d_parent != new_upperdir)
1416+ goto out_dput;
1417+ if (olddentry == trap)
1418+ goto out_dput;
1419+ if (newdentry == trap)
1420+ goto out_dput;
1421+
1422+ old_opaque = ovl_dentry_is_opaque(old);
1423+ new_opaque = ovl_dentry_is_opaque(new) || new_type != OVL_PATH_UPPER;
1424+
1425+ if (is_dir && !old_opaque && new_opaque) {
1426+ err = ovl_set_opaque(olddentry);
1427+ if (err)
1428+ goto out_dput;
1429+ }
1430+
1431+ err = vfs_rename(old_upperdir->d_inode, olddentry,
1432+ new_upperdir->d_inode, newdentry);
1433+
1434+ if (err) {
1435+ if (new_create && ovl_dentry_is_opaque(new))
1436+ ovl_whiteout(new_upperdir, new);
1437+ if (is_dir && !old_opaque && new_opaque)
1438+ ovl_remove_opaque(olddentry); /* undo the ovl_set_opaque() above */
1439+ goto out_dput;
1440+ }
1441+
1442+ if (old_type != OVL_PATH_UPPER || old_opaque)
1443+ err = ovl_whiteout(old_upperdir, old);
1444+ if (is_dir && old_opaque && !new_opaque)
1445+ ovl_remove_opaque(olddentry);
1446+
1447+ if (old_opaque != new_opaque)
1448+ ovl_dentry_set_opaque(old, new_opaque);
1449+
1450+ ovl_dentry_version_inc(old->d_parent);
1451+ ovl_dentry_version_inc(new->d_parent);
1452+
1453+out_dput:
1454+ dput(newdentry);
1455+out_unlock:
1456+ unlock_rename(new_upperdir, old_upperdir);
1457+ return err;
1458+}
1459+/* Inode operations for overlay directories; installed by ovl_new_inode(). */
1460+const struct inode_operations ovl_dir_inode_operations = {
1461+ .lookup = ovl_lookup,
1462+ .mkdir = ovl_mkdir,
1463+ .symlink = ovl_symlink,
1464+ .unlink = ovl_unlink,
1465+ .rmdir = ovl_rmdir,
1466+ .rename = ovl_rename,
1467+ .link = ovl_link,
1468+ .setattr = ovl_setattr,
1469+ .create = ovl_create,
1470+ .mknod = ovl_mknod,
1471+ .permission = ovl_permission,
1472+ .getattr = ovl_dir_getattr,
1473+ .setxattr = ovl_setxattr,
1474+ .getxattr = ovl_getxattr,
1475+ .listxattr = ovl_listxattr,
1476+ .removexattr = ovl_removexattr,
1477+};
1478--- /dev/null
1479+++ b/fs/overlayfs/inode.c
1480@@ -0,0 +1,384 @@
1481+/*
1482+ *
1483+ * Copyright (C) 2011 Novell Inc.
1484+ *
1485+ * This program is free software; you can redistribute it and/or modify it
1486+ * under the terms of the GNU General Public License version 2 as published by
1487+ * the Free Software Foundation.
1488+ */
1489+
1490+#include <linux/fs.h>
1491+#include <linux/slab.h>
1492+#include <linux/xattr.h>
1493+#include "overlayfs.h"
1494+/* ->setattr: copy @dentry up, then apply @attr to the upper dentry. */
1495+int ovl_setattr(struct dentry *dentry, struct iattr *attr)
1496+{
1497+ struct dentry *upperdentry;
1498+ int err;
1499+
1500+ if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry))
1501+ err = ovl_copy_up_truncate(dentry, attr->ia_size); /* copy up at the new size */
1502+ else
1503+ err = ovl_copy_up(dentry);
1504+ if (err)
1505+ return err;
1506+
1507+ upperdentry = ovl_dentry_upper(dentry);
1508+
1509+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
1510+ attr->ia_valid &= ~ATTR_MODE;
1511+
1512+ mutex_lock(&upperdentry->d_inode->i_mutex);
1513+ err = notify_change(upperdentry, attr);
1514+ mutex_unlock(&upperdentry->d_inode->i_mutex);
1515+
1516+ return err;
1517+}
1518+/* ->getattr for non-directories: report the real object's attributes as-is. */
1519+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
1520+ struct kstat *stat)
1521+{
1522+ struct path realpath;
1523+
1524+ ovl_path_real(dentry, &realpath);
1525+ return vfs_getattr(realpath.mnt, realpath.dentry, stat);
1526+}
1527+/* ->permission: run the access check against the real (upper or lower) inode. */
1528+int ovl_permission(struct inode *inode, int mask)
1529+{
1530+ struct ovl_entry *oe;
1531+ struct dentry *alias = NULL;
1532+ struct inode *realinode;
1533+ struct dentry *realdentry;
1534+ bool is_upper;
1535+ int err;
1536+
1537+ if (S_ISDIR(inode->i_mode)) {
1538+ oe = inode->i_private; /* stored by ovl_new_inode() for directories */
1539+ } else if (mask & MAY_NOT_BLOCK) {
1540+ return -ECHILD;
1541+ } else {
1542+ /*
1543+ * For non-directories find an alias and get the info
1544+ * from there.
1545+ */
1546+ spin_lock(&inode->i_lock);
1547+ if (WARN_ON(list_empty(&inode->i_dentry))) {
1548+ spin_unlock(&inode->i_lock);
1549+ return -ENOENT;
1550+ }
1551+ alias = list_entry(inode->i_dentry.next,
1552+ struct dentry, d_alias);
1553+ dget(alias); /* dropped at out_dput */
1554+ spin_unlock(&inode->i_lock);
1555+ oe = alias->d_fsdata;
1556+ }
1557+
1558+ realdentry = ovl_entry_real(oe, &is_upper);
1559+
1560+ /* Careful in RCU walk mode */
1561+ realinode = ACCESS_ONCE(realdentry->d_inode);
1562+ if (!realinode) {
1563+ WARN_ON(!(mask & MAY_NOT_BLOCK));
1564+ err = -ENOENT;
1565+ goto out_dput;
1566+ }
1567+
1568+ if (mask & MAY_WRITE) {
1569+ umode_t mode = realinode->i_mode;
1570+
1571+ /*
1572+ * Writes will always be redirected to upper layer, so
1573+ * ignore lower layer being read-only.
1574+ *
1575+ * If the overlay itself is read-only then proceed
1576+ * with the permission check, don't return EROFS.
1577+ * This will only happen if this is the lower layer of
1578+ * another overlayfs.
1579+ *
1580+ * If upper fs becomes read-only after the overlay was
1581+ * constructed return EROFS to prevent modification of
1582+ * upper layer.
1583+ */
1584+ err = -EROFS;
1585+ if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
1586+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1587+ goto out_dput;
1588+
1589+ /*
1590+ * Nobody gets write access to an immutable file.
1591+ */
1592+ err = -EACCES;
1593+ if (IS_IMMUTABLE(realinode))
1594+ goto out_dput;
1595+ }
1596+
1597+ if (realinode->i_op->permission)
1598+ err = realinode->i_op->permission(realinode, mask);
1599+ else
1600+ err = generic_permission(realinode, mask);
1601+out_dput:
1602+ dput(alias); /* alias is NULL on the directory path */
1603+ return err;
1604+}
1605+
1606+/* Cookie passed from ovl_follow_link() to ovl_put_link(). */
1607+struct ovl_link_data {
1608+ struct dentry *realdentry; /* dentry whose ->put_link must be called */
1609+ void *cookie; /* value returned by the real ->follow_link */
1610+};
1611+/* ->follow_link: delegate to the real inode; wrap its cookie for ovl_put_link(). */
1612+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
1613+{
1614+ void *ret;
1615+ struct dentry *realdentry;
1616+ struct inode *realinode;
1617+
1618+ realdentry = ovl_dentry_real(dentry);
1619+ realinode = realdentry->d_inode;
1620+
1621+ if (WARN_ON(!realinode->i_op->follow_link))
1622+ return ERR_PTR(-EPERM);
1623+
1624+ ret = realinode->i_op->follow_link(realdentry, nd);
1625+ if (IS_ERR(ret))
1626+ return ret;
1627+
1628+ if (realinode->i_op->put_link) {
1629+ struct ovl_link_data *data;
1630+
1631+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
1632+ if (!data) {
1633+ realinode->i_op->put_link(realdentry, nd, ret); /* release the real cookie */
1634+ return ERR_PTR(-ENOMEM);
1635+ }
1636+ data->realdentry = realdentry;
1637+ data->cookie = ret;
1638+
1639+ return data;
1640+ } else {
1641+ return NULL; /* no real ->put_link, so nothing to remember */
1642+ }
1643+}
1644+/* ->put_link: call the real ->put_link with the saved cookie, free the wrapper. */
1645+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1646+{
1647+ struct inode *realinode;
1648+ struct ovl_link_data *data = c;
1649+
1650+ if (!data) /* ovl_follow_link() returned NULL: no real ->put_link */
1651+ return;
1652+
1653+ realinode = data->realdentry->d_inode;
1654+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
1655+ kfree(data);
1656+}
1657+/* ->readlink: delegate to the real inode's ->readlink, -EINVAL if it has none. */
1658+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
1659+{
1660+ struct path realpath;
1661+ struct inode *realinode;
1662+
1663+ ovl_path_real(dentry, &realpath);
1664+ realinode = realpath.dentry->d_inode;
1665+
1666+ if (!realinode->i_op->readlink)
1667+ return -EINVAL;
1668+
1669+ touch_atime(realpath.mnt, realpath.dentry);
1670+
1671+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
1672+}
1673+
1674+/* True if @name starts with the full "trusted.overlay." prefix, '.' included. */
1675+static bool ovl_is_private_xattr(const char *name)
1676+{
1677+ return strncmp(name, "trusted.overlay.", sizeof("trusted.overlay.") - 1) == 0;
1678+}
1679+/* ->setxattr: refuse private names (-EPERM), copy up, set on the upper dentry. */
1680+int ovl_setxattr(struct dentry *dentry, const char *name,
1681+ const void *value, size_t size, int flags)
1682+{
1683+ int err;
1684+ struct dentry *upperdentry;
1685+
1686+ if (ovl_is_private_xattr(name))
1687+ return -EPERM;
1688+
1689+ err = ovl_copy_up(dentry);
1690+ if (err)
1691+ return err;
1692+
1693+ upperdentry = ovl_dentry_upper(dentry);
1694+ return vfs_setxattr(upperdentry, name, value, size, flags);
1695+}
1696+/* ->getxattr: read from the real dentry; private names hidden under merged dirs. */
1697+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1698+ void *value, size_t size)
1699+{
1700+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1701+ ovl_is_private_xattr(name))
1702+ return -ENODATA;
1703+
1704+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
1705+}
1706+/* ->listxattr: list the real dentry's names, minus private ones under merged dirs. */
1707+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
1708+{
1709+ ssize_t res;
1710+ int off;
1711+
1712+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
1713+ if (res <= 0 || size == 0) /* error, empty list, or size == 0: no filtering */
1714+ return res;
1715+
1716+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
1717+ return res;
1718+
1719+ /* filter out private xattrs */
1720+ for (off = 0; off < res;) {
1721+ char *s = list + off;
1722+ size_t slen = strlen(s) + 1; /* name plus its NUL terminator */
1723+
1724+ BUG_ON(off + slen > res);
1725+
1726+ if (ovl_is_private_xattr(s)) {
1727+ res -= slen;
1728+ memmove(s, s + slen, res - off); /* slide the remaining names down */
1729+ } else {
1730+ off += slen;
1731+ }
1732+ }
1733+
1734+ return res;
1735+}
1736+/* ->removexattr: copy a lower object up only if it really has the attribute. */
1737+int ovl_removexattr(struct dentry *dentry, const char *name)
1738+{
1739+ int err;
1740+ struct path realpath;
1741+ enum ovl_path_type type;
1742+
1743+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
1744+ ovl_is_private_xattr(name))
1745+ return -ENODATA;
1746+
1747+ type = ovl_path_real(dentry, &realpath);
1748+ if (type == OVL_PATH_LOWER) {
1749+ err = vfs_getxattr(realpath.dentry, name, NULL, 0); /* existence probe only */
1750+ if (err < 0)
1751+ return err;
1752+
1753+ err = ovl_copy_up(dentry);
1754+ if (err)
1755+ return err;
1756+
1757+ ovl_path_upper(dentry, &realpath); /* operate on the fresh upper copy */
1758+ }
1759+
1760+ return vfs_removexattr(realpath.dentry, name);
1761+}
1762+/* True only for a lower, non-special file opened for writing or with O_TRUNC. */
1763+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
1764+ struct dentry *realdentry)
1765+{
1766+ if (type != OVL_PATH_LOWER)
1767+ return false;
1768+
1769+ if (special_file(realdentry->d_inode->i_mode))
1770+ return false;
1771+
1772+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
1773+ return false;
1774+
1775+ return true;
1776+}
1777+/* ->open: copy up first when required, then open the real path with vfs_open(). */
1778+static struct file *ovl_open(struct dentry *dentry, struct file *file,
1779+ const struct cred *cred)
1780+{
1781+ int err;
1782+ struct path realpath;
1783+ enum ovl_path_type type;
1784+
1785+ type = ovl_path_real(dentry, &realpath);
1786+ if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
1787+ if (file->f_flags & O_TRUNC)
1788+ err = ovl_copy_up_truncate(dentry, 0); /* truncating copy-up to size 0 */
1789+ else
1790+ err = ovl_copy_up(dentry);
1791+ if (err)
1792+ return ERR_PTR(err);
1793+
1794+ ovl_path_upper(dentry, &realpath); /* open the upper copy instead */
1795+ }
1796+
1797+ return vfs_open(&realpath, file, cred);
1798+}
1799+/* Inode operations ovl_new_inode() installs for regular and special files. */
1800+static const struct inode_operations ovl_file_inode_operations = {
1801+ .setattr = ovl_setattr,
1802+ .permission = ovl_permission,
1803+ .getattr = ovl_getattr,
1804+ .setxattr = ovl_setxattr,
1805+ .getxattr = ovl_getxattr,
1806+ .listxattr = ovl_listxattr,
1807+ .removexattr = ovl_removexattr,
1808+ .open = ovl_open,
1809+};
1810+/* Inode operations ovl_new_inode() installs for symlinks. */
1811+static const struct inode_operations ovl_symlink_inode_operations = {
1812+ .setattr = ovl_setattr,
1813+ .follow_link = ovl_follow_link,
1814+ .put_link = ovl_put_link,
1815+ .readlink = ovl_readlink,
1816+ .getattr = ovl_getattr,
1817+ .setxattr = ovl_setxattr,
1818+ .getxattr = ovl_getxattr,
1819+ .listxattr = ovl_listxattr,
1820+ .removexattr = ovl_removexattr,
1821+};
1822+/* Allocate an overlay inode for file type @mode; NULL on failure or bad type. */
1823+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1824+ struct ovl_entry *oe)
1825+{
1826+ struct inode *inode;
1827+
1828+ inode = new_inode(sb);
1829+ if (!inode)
1830+ return NULL;
1831+
1832+ mode &= S_IFMT; /* only the file type is stored in the overlay inode */
1833+
1834+ inode->i_ino = get_next_ino();
1835+ inode->i_mode = mode;
1836+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
1837+
1838+ switch (mode) {
1839+ case S_IFDIR:
1840+ inode->i_private = oe; /* read back by ovl_permission() */
1841+ inode->i_op = &ovl_dir_inode_operations;
1842+ inode->i_fop = &ovl_dir_operations;
1843+ break;
1844+
1845+ case S_IFLNK:
1846+ inode->i_op = &ovl_symlink_inode_operations;
1847+ break;
1848+
1849+ case S_IFREG:
1850+ case S_IFSOCK:
1851+ case S_IFBLK:
1852+ case S_IFCHR:
1853+ case S_IFIFO:
1854+ inode->i_op = &ovl_file_inode_operations;
1855+ break;
1856+
1857+ default:
1858+ WARN(1, "illegal file type: %i\n", mode);
1859+ iput(inode); /* drop the new_inode() reference instead of leaking it */
1860+ inode = NULL;
1861+ }
1862+
1863+ return inode;
1864+}
1865--- /dev/null
1866+++ b/fs/overlayfs/overlayfs.h
1867@@ -0,0 +1,64 @@
1868+/*
1869+ *
1870+ * Copyright (C) 2011 Novell Inc.
1871+ *
1872+ * This program is free software; you can redistribute it and/or modify it
1873+ * under the terms of the GNU General Public License version 2 as published by
1874+ * the Free Software Foundation.
1875+ */
1876+
1877+struct ovl_entry;
1878+
1879+enum ovl_path_type {
1880+ OVL_PATH_UPPER, /* presumably: object exists only in the upper layer -- confirm */
1881+ OVL_PATH_MERGE, /* presumably: directory present in both layers -- confirm */
1882+ OVL_PATH_LOWER, /* presumably: object exists only in the lower layer -- confirm */
1883+};
1884+
1885+extern const char *ovl_opaque_xattr;
1886+extern const char *ovl_whiteout_xattr;
1887+extern const struct dentry_operations ovl_dentry_operations;
1888+
1889+enum ovl_path_type ovl_path_type(struct dentry *dentry);
1890+u64 ovl_dentry_version_get(struct dentry *dentry);
1891+void ovl_dentry_version_inc(struct dentry *dentry);
1892+void ovl_path_upper(struct dentry *dentry, struct path *path);
1893+void ovl_path_lower(struct dentry *dentry, struct path *path);
1894+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
1895+struct dentry *ovl_dentry_upper(struct dentry *dentry);
1896+struct dentry *ovl_dentry_lower(struct dentry *dentry);
1897+struct dentry *ovl_dentry_real(struct dentry *dentry);
1898+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
1899+bool ovl_dentry_is_opaque(struct dentry *dentry);
1900+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
1901+bool ovl_is_whiteout(struct dentry *dentry);
1902+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
1903+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
1904+ struct nameidata *nd);
1905+struct file *ovl_path_open(struct path *path, int flags);
1906+
1907+struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
1908+ struct kstat *stat, const char *link);
1909+
1910+/* readdir.c */
1911+extern const struct file_operations ovl_dir_operations;
1912+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type);
1913+
1914+/* inode.c */
1915+int ovl_setattr(struct dentry *dentry, struct iattr *attr);
1916+int ovl_permission(struct inode *inode, int mask);
1917+int ovl_setxattr(struct dentry *dentry, const char *name,
1918+ const void *value, size_t size, int flags);
1919+ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
1920+ void *value, size_t size);
1921+ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
1922+int ovl_removexattr(struct dentry *dentry, const char *name);
1923+
1924+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1925+ struct ovl_entry *oe);
1926+/* dir.c */
1927+extern const struct inode_operations ovl_dir_inode_operations;
1928+
1929+/* copy_up.c */
1930+int ovl_copy_up(struct dentry *dentry);
1931+int ovl_copy_up_truncate(struct dentry *dentry, loff_t size);
1932--- /dev/null
1933+++ b/fs/overlayfs/readdir.c
1934@@ -0,0 +1,565 @@
1935+/*
1936+ *
1937+ * Copyright (C) 2011 Novell Inc.
1938+ *
1939+ * This program is free software; you can redistribute it and/or modify it
1940+ * under the terms of the GNU General Public License version 2 as published by
1941+ * the Free Software Foundation.
1942+ */
1943+
1944+#include <linux/fs.h>
1945+#include <linux/slab.h>
1946+#include <linux/namei.h>
1947+#include <linux/file.h>
1948+#include <linux/xattr.h>
1949+#include <linux/rbtree.h>
1950+#include <linux/security.h>
1951+#include "overlayfs.h"
1952+/* One directory entry held in the readdir cache. */
1953+struct ovl_cache_entry {
1954+ const char *name; /* NUL-terminated copy stored right after the struct */
1955+ unsigned int len; /* length of @name without the NUL */
1956+ unsigned int type; /* d_type as passed to the filldir callback */
1957+ u64 ino;
1958+ bool is_whiteout; /* set by ovl_dir_mark_whiteouts() */
1959+ struct list_head l_node; /* link in the ordered cache list */
1960+ struct rb_node node; /* link in the by-name rbtree */
1961+};
1962+/* State shared between ovl_dir_read() and the filldir callbacks. */
1963+struct ovl_readdir_data {
1964+ struct rb_root *root; /* by-name index of entries read so far */
1965+ struct list_head *list; /* cache list being filled */
1966+ struct list_head *middle; /* marker in front of which lower entries go */
1967+ struct dentry *dir; /* upper directory used for whiteout lookups */
1968+ int count; /* entries seen in the current vfs_readdir() pass */
1969+ int err; /* error recorded by a filldir callback */
1970+};
1971+/* Per-open-directory state kept in file->private_data. */
1972+struct ovl_dir_file {
1973+ bool is_real; /* readdir is passed straight through to @realfile */
1974+ bool is_cached; /* cleared by ovl_dir_reset() when the cache is dropped */
1975+ struct list_head cursor; /* position marker linked into @cache */
1976+ u64 cache_version; /* compared with ovl_dentry_version_get() */
1977+ struct list_head cache; /* list of struct ovl_cache_entry */
1978+ struct file *realfile; /* open real directory, used when @is_real */
1979+};
1980+/* Map an rbtree node back to the cache entry that embeds it. */
1981+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
1982+{
1983+ return container_of(n, struct ovl_cache_entry, node);
1984+}
1985+/* Find the entry named @name (@len bytes) in @root; NULL if there is none. */
1986+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
1987+ const char *name, int len)
1988+{
1989+ struct rb_node *node = root->rb_node;
1990+ int cmp;
1991+
1992+ while (node) {
1993+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
1994+
1995+ cmp = strncmp(name, p->name, len);
1996+ if (cmp > 0)
1997+ node = p->node.rb_right;
1998+ else if (cmp < 0 || len < p->len) /* @name is a strict prefix of p->name */
1999+ node = p->node.rb_left;
2000+ else
2001+ return p;
2002+ }
2003+
2004+ return NULL;
2005+}
2006+/* Allocate a cache entry with a private copy of @name; NULL if kmalloc fails. */
2007+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
2008+ u64 ino, unsigned int d_type)
2009+{
2010+ struct ovl_cache_entry *p;
2011+
2012+ p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); /* struct + name + NUL */
2013+ if (p) {
2014+ char *name_copy = (char *) (p + 1);
2015+ memcpy(name_copy, name, len);
2016+ name_copy[len] = '\0';
2017+ p->name = name_copy;
2018+ p->len = len;
2019+ p->type = d_type;
2020+ p->ino = ino;
2021+ p->is_whiteout = false;
2022+ }
2023+
2024+ return p;
2025+}
2026+/* Add @name to rdd's rbtree and list unless already present; 0 or -ENOMEM. */
2027+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
2028+ const char *name, int len, u64 ino,
2029+ unsigned int d_type)
2030+{
2031+ struct rb_node **newp = &rdd->root->rb_node;
2032+ struct rb_node *parent = NULL;
2033+ struct ovl_cache_entry *p;
2034+
2035+ while (*newp) {
2036+ int cmp;
2037+ struct ovl_cache_entry *tmp;
2038+
2039+ parent = *newp;
2040+ tmp = ovl_cache_entry_from_node(*newp);
2041+ cmp = strncmp(name, tmp->name, len);
2042+ if (cmp > 0)
2043+ newp = &tmp->node.rb_right;
2044+ else if (cmp < 0 || len < tmp->len)
2045+ newp = &tmp->node.rb_left;
2046+ else
2047+ return 0; /* name already cached: keep the existing entry */
2048+ }
2049+
2050+ p = ovl_cache_entry_new(name, len, ino, d_type);
2051+ if (p == NULL)
2052+ return -ENOMEM;
2053+
2054+ list_add_tail(&p->l_node, rdd->list);
2055+ rb_link_node(&p->node, parent, newp);
2056+ rb_insert_color(&p->node, rdd->root);
2057+
2058+ return 0;
2059+}
2060+/* filldir callback for the lower dir: queue each entry in front of rdd->middle. */
2061+static int ovl_fill_lower(void *buf, const char *name, int namelen,
2062+ loff_t offset, u64 ino, unsigned int d_type)
2063+{
2064+ struct ovl_readdir_data *rdd = buf;
2065+ struct ovl_cache_entry *p;
2066+
2067+ rdd->count++;
2068+ p = ovl_cache_entry_find(rdd->root, name, namelen);
2069+ if (p) {
2070+ list_move_tail(&p->l_node, rdd->middle); /* reuse the entry already indexed */
2071+ } else {
2072+ p = ovl_cache_entry_new(name, namelen, ino, d_type);
2073+ if (p == NULL)
2074+ rdd->err = -ENOMEM;
2075+ else
2076+ list_add_tail(&p->l_node, rdd->middle);
2077+ }
2078+
2079+ return rdd->err;
2080+}
2081+/* Free every cache entry on @list and leave @list empty. */
2082+static void ovl_cache_free(struct list_head *list)
2083+{
2084+ struct ovl_cache_entry *p;
2085+ struct ovl_cache_entry *n;
2086+
2087+ list_for_each_entry_safe(p, n, list, l_node)
2088+ kfree(p); /* the name is part of the same allocation */
2089+
2090+ INIT_LIST_HEAD(list);
2091+}
2092+/* filldir callback for the upper dir: index each entry by name and list it. */
2093+static int ovl_fill_upper(void *buf, const char *name, int namelen,
2094+ loff_t offset, u64 ino, unsigned int d_type)
2095+{
2096+ struct ovl_readdir_data *rdd = buf;
2097+
2098+ rdd->count++;
2099+ return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); /* NOTE(review): error is not stored in rdd->err */
2100+}
2101+/* Read all of @realpath through @filler; 0 on success, negative errno on error. */
2102+static inline int ovl_dir_read(struct path *realpath,
2103+ struct ovl_readdir_data *rdd, filldir_t filler)
2104+{
2105+ struct file *realfile;
2106+ int err;
2107+
2108+ realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
2109+ if (IS_ERR(realfile))
2110+ return PTR_ERR(realfile);
2111+
2112+ do { /* repeat until a pass reports no entries or an error occurs */
2113+ rdd->count = 0;
2114+ rdd->err = 0;
2115+ err = vfs_readdir(realfile, filler, rdd);
2116+ if (err >= 0)
2117+ err = rdd->err; /* pick up an error recorded by the callback */
2118+ } while (!err && rdd->count);
2119+ fput(realfile);
2120+
2121+ return err; /* was "return 0", which hid read and fill errors from callers */
2122+}
2123+/* Drop a stale cache and release the real file if the dir is now merged. */
2124+static void ovl_dir_reset(struct file *file)
2125+{
2126+ struct ovl_dir_file *od = file->private_data;
2127+ enum ovl_path_type type = ovl_path_type(file->f_path.dentry);
2128+
2129+ if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) {
2130+ list_del_init(&od->cursor); /* unlink the cursor before freeing entries */
2131+ ovl_cache_free(&od->cache);
2132+ od->is_cached = false;
2133+ }
2134+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
2135+ if (od->is_real && type == OVL_PATH_MERGE) {
2136+ fput(od->realfile);
2137+ od->realfile = NULL;
2138+ od->is_real = false;
2139+ }
2140+}
2141+/* Set is_whiteout on every DT_LNK entry of rdd->list that is a whiteout. */
2142+static int ovl_dir_mark_whiteouts(struct ovl_readdir_data *rdd)
2143+{
2144+ struct ovl_cache_entry *p;
2145+ struct dentry *dentry;
2146+ const struct cred *old_cred;
2147+ struct cred *override_cred;
2148+
2149+ override_cred = prepare_creds();
2150+ if (!override_cred) {
2151+ ovl_cache_free(rdd->list); /* the partial cache is discarded on failure */
2152+ return -ENOMEM;
2153+ }
2154+
2155+ /*
2156+ * CAP_SYS_ADMIN for getxattr
2157+ * CAP_DAC_OVERRIDE for lookup
2158+ */
2159+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2160+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2161+ old_cred = override_creds(override_cred);
2162+
2163+ mutex_lock(&rdd->dir->d_inode->i_mutex);
2164+ list_for_each_entry(p, rdd->list, l_node) {
2165+ if (p->type != DT_LNK) /* whiteouts are symlinks */
2166+ continue;
2167+
2168+ dentry = lookup_one_len(p->name, rdd->dir, p->len);
2169+ if (IS_ERR(dentry)) /* lookup failure: entry stays unmarked */
2170+ continue;
2171+
2172+ p->is_whiteout = ovl_is_whiteout(dentry);
2173+ dput(dentry);
2174+ }
2175+ mutex_unlock(&rdd->dir->d_inode->i_mutex);
2176+
2177+ revert_creds(old_cred);
2178+ put_cred(override_cred);
2179+
2180+ return 0;
2181+}
2182+
2183+static inline int ovl_dir_read_merged(struct path *upperpath,
2184+ struct path *lowerpath,
2185+ struct ovl_readdir_data *rdd)
2186+{
2187+ int err;
2188+ struct rb_root root = RB_ROOT;
2189+ struct list_head middle;
2190+
2191+ rdd->root = &root;
2192+ if (upperpath->dentry) {
2193+ rdd->dir = upperpath->dentry;
2194+ err = ovl_dir_read(upperpath, rdd, ovl_fill_upper);
2195+ if (err)
2196+ goto out;
2197+
2198+ err = ovl_dir_mark_whiteouts(rdd);
2199+ if (err)
2200+ goto out;
2201+ }
2202+ /*
2203+ * Insert lowerpath entries before upperpath ones, this allows
2204+ * offsets to be reasonably constant
2205+ */
2206+ list_add(&middle, rdd->list);
2207+ rdd->middle = &middle;
2208+ err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower);
2209+ list_del(&middle);
2210+out:
2211+ rdd->root = NULL;
2212+
2213+ return err;
2214+}
2215+
2216+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
2217+{
2218+ struct list_head *l;
2219+ loff_t off;
2220+
2221+ l = od->cache.next;
2222+ for (off = 0; off < pos; off++) {
2223+ if (l == &od->cache)
2224+ break;
2225+ l = l->next;
2226+ }
2227+ list_move_tail(&od->cursor, l);
2228+}
2229+
2230+static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
2231+{
2232+ struct ovl_dir_file *od = file->private_data;
2233+ int res;
2234+
2235+ if (!file->f_pos)
2236+ ovl_dir_reset(file);
2237+
2238+ if (od->is_real) {
2239+ res = vfs_readdir(od->realfile, filler, buf);
2240+ file->f_pos = od->realfile->f_pos;
2241+
2242+ return res;
2243+ }
2244+
2245+ if (!od->is_cached) {
2246+ struct path lowerpath;
2247+ struct path upperpath;
2248+ struct ovl_readdir_data rdd = { .list = &od->cache };
2249+
2250+ ovl_path_lower(file->f_path.dentry, &lowerpath);
2251+ ovl_path_upper(file->f_path.dentry, &upperpath);
2252+
2253+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2254+ if (res) {
2255+ ovl_cache_free(rdd.list);
2256+ return res;
2257+ }
2258+
2259+ od->cache_version = ovl_dentry_version_get(file->f_path.dentry);
2260+ od->is_cached = true;
2261+
2262+ ovl_seek_cursor(od, file->f_pos);
2263+ }
2264+
2265+ while (od->cursor.next != &od->cache) {
2266+ int over;
2267+ loff_t off;
2268+ struct ovl_cache_entry *p;
2269+
2270+ p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node);
2271+ off = file->f_pos;
2272+ if (!p->is_whiteout) {
2273+ over = filler(buf, p->name, p->len, off, p->ino,
2274+ p->type);
2275+ if (over)
2276+ break;
2277+ }
2278+ file->f_pos++;
2279+ list_move(&od->cursor, &p->l_node);
2280+ }
2281+
2282+ return 0;
2283+}
2284+
2285+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
2286+{
2287+ loff_t res;
2288+ struct ovl_dir_file *od = file->private_data;
2289+
2290+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
2291+ if (!file->f_pos)
2292+ ovl_dir_reset(file);
2293+
2294+ if (od->is_real) {
2295+ res = vfs_llseek(od->realfile, offset, origin);
2296+ file->f_pos = od->realfile->f_pos;
2297+ } else {
2298+ res = -EINVAL;
2299+
2300+ switch (origin) {
2301+ case SEEK_CUR:
2302+ offset += file->f_pos;
2303+ break;
2304+ case SEEK_SET:
2305+ break;
2306+ default:
2307+ goto out_unlock;
2308+ }
2309+ if (offset < 0)
2310+ goto out_unlock;
2311+
2312+ if (offset != file->f_pos) {
2313+ file->f_pos = offset;
2314+ if (od->is_cached)
2315+ ovl_seek_cursor(od, offset);
2316+ }
2317+ res = offset;
2318+ }
2319+out_unlock:
2320+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
2321+
2322+ return res;
2323+}
2324+
2325+static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
2326+ int datasync)
2327+{
2328+ struct ovl_dir_file *od = file->private_data;
2329+
2330+ /* May need to reopen directory if it got copied up */
2331+ if (!od->realfile) {
2332+ struct path upperpath;
2333+ struct file *realfile;
2334+ ovl_path_upper(file->f_path.dentry, &upperpath);
2335+ realfile = ovl_path_open(&upperpath, O_RDONLY);
2336+ if (IS_ERR(realfile))
2337+ return PTR_ERR(realfile); /* od->realfile stays NULL, never an ERR_PTR */
2338+ od->realfile = realfile;
2339+ }
2340+ return vfs_fsync_range(od->realfile, start, end, datasync);
2341+}
2342+
2343+static int ovl_dir_release(struct inode *inode, struct file *file)
2344+{
2345+ struct ovl_dir_file *od = file->private_data;
2346+
2347+ list_del(&od->cursor);
2348+ ovl_cache_free(&od->cache);
2349+ if (od->realfile)
2350+ fput(od->realfile);
2351+ kfree(od);
2352+
2353+ return 0;
2354+}
2355+
2356+static int ovl_dir_open(struct inode *inode, struct file *file)
2357+{
2358+ struct path realpath;
2359+ struct file *realfile;
2360+ struct ovl_dir_file *od;
2361+ enum ovl_path_type type;
2362+
2363+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
2364+ if (!od)
2365+ return -ENOMEM;
2366+
2367+ type = ovl_path_real(file->f_path.dentry, &realpath);
2368+ realfile = ovl_path_open(&realpath, file->f_flags);
2369+ if (IS_ERR(realfile)) {
2370+ kfree(od);
2371+ return PTR_ERR(realfile);
2372+ }
2373+ INIT_LIST_HEAD(&od->cache);
2374+ INIT_LIST_HEAD(&od->cursor);
2375+ od->is_cached = false;
2376+ od->realfile = realfile;
2377+ od->is_real = (type != OVL_PATH_MERGE);
2378+ file->private_data = od;
2379+
2380+ return 0;
2381+}
2382+
2383+const struct file_operations ovl_dir_operations = {
2384+ .read = generic_read_dir,
2385+ .open = ovl_dir_open,
2386+ .readdir = ovl_readdir,
2387+ .llseek = ovl_dir_llseek,
2388+ .fsync = ovl_dir_fsync,
2389+ .release = ovl_dir_release,
2390+};
2391+
2392+static int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
2393+{
2394+ int err;
2395+ struct path lowerpath;
2396+ struct path upperpath;
2397+ struct ovl_cache_entry *p;
2398+ struct ovl_readdir_data rdd = { .list = list };
2399+
2400+ ovl_path_upper(dentry, &upperpath);
2401+ ovl_path_lower(dentry, &lowerpath);
2402+
2403+ err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2404+ if (err)
2405+ return err;
2406+
2407+ err = 0;
2408+
2409+ list_for_each_entry(p, list, l_node) {
2410+ if (p->is_whiteout)
2411+ continue;
2412+
2413+ if (p->name[0] == '.') {
2414+ if (p->len == 1)
2415+ continue;
2416+ if (p->len == 2 && p->name[1] == '.')
2417+ continue;
2418+ }
2419+ err = -ENOTEMPTY;
2420+ break;
2421+ }
2422+
2423+ return err;
2424+}
2425+
2426+static int ovl_remove_whiteouts(struct dentry *dir, struct list_head *list)
2427+{
2428+ struct path upperpath;
2429+ struct dentry *upperdir;
2430+ struct ovl_cache_entry *p;
2431+ const struct cred *old_cred;
2432+ struct cred *override_cred;
2433+ int err;
2434+
2435+ ovl_path_upper(dir, &upperpath);
2436+ upperdir = upperpath.dentry;
2437+
2438+ override_cred = prepare_creds();
2439+ if (!override_cred)
2440+ return -ENOMEM;
2441+
2442+ /*
2443+ * CAP_DAC_OVERRIDE for lookup and unlink
2444+ * CAP_SYS_ADMIN for setxattr of "trusted" namespace
2445+ * CAP_FOWNER for unlink in sticky directory
2446+ */
2447+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
2448+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2449+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
2450+ old_cred = override_creds(override_cred);
2451+
2452+ err = vfs_setxattr(upperdir, ovl_opaque_xattr, "y", 1, 0);
2453+ if (err)
2454+ goto out_revert_creds;
2455+
2456+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2457+ list_for_each_entry(p, list, l_node) {
2458+ struct dentry *dentry;
2459+ int ret;
2460+
2461+ if (!p->is_whiteout)
2462+ continue;
2463+
2464+ dentry = lookup_one_len(p->name, upperdir, p->len);
2465+ if (IS_ERR(dentry)) {
2466+ printk(KERN_WARNING
2467+ "overlayfs: failed to lookup whiteout %.*s: %li\n",
2468+ p->len, p->name, PTR_ERR(dentry));
2469+ continue;
2470+ }
2471+ ret = vfs_unlink(upperdir->d_inode, dentry);
2472+ dput(dentry);
2473+ if (ret)
2474+ printk(KERN_WARNING
2475+ "overlayfs: failed to unlink whiteout %.*s: %i\n",
2476+ p->len, p->name, ret);
2477+ }
2478+ mutex_unlock(&upperdir->d_inode->i_mutex);
2479+
2480+out_revert_creds:
2481+ revert_creds(old_cred);
2482+ put_cred(override_cred);
2483+
2484+ return err;
2485+}
2486+
2487+int ovl_check_empty_and_clear(struct dentry *dentry, enum ovl_path_type type)
2488+{
2489+ int err;
2490+ LIST_HEAD(list);
2491+
2492+ err = ovl_check_empty_dir(dentry, &list);
2493+ if (!err && type == OVL_PATH_MERGE)
2494+ err = ovl_remove_whiteouts(dentry, &list);
2495+
2496+ ovl_cache_free(&list);
2497+
2498+ return err;
2499+}
2500--- /dev/null
2501+++ b/fs/overlayfs/super.c
2502@@ -0,0 +1,664 @@
2503+/*
2504+ *
2505+ * Copyright (C) 2011 Novell Inc.
2506+ *
2507+ * This program is free software; you can redistribute it and/or modify it
2508+ * under the terms of the GNU General Public License version 2 as published by
2509+ * the Free Software Foundation.
2510+ */
2511+
2512+#include <linux/fs.h>
2513+#include <linux/namei.h>
2514+#include <linux/xattr.h>
2515+#include <linux/security.h>
2516+#include <linux/mount.h>
2517+#include <linux/slab.h>
2518+#include <linux/parser.h>
2519+#include <linux/module.h>
2520+#include <linux/seq_file.h>
2521+#include "overlayfs.h"
2522+
2523+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
2524+MODULE_DESCRIPTION("Overlay filesystem");
2525+MODULE_LICENSE("GPL");
2526+
2527+struct ovl_config {
2528+ char *lowerdir;
2529+ char *upperdir;
2530+};
2531+
2532+/* private information held for overlayfs's superblock */
2533+struct ovl_fs {
2534+ struct vfsmount *upper_mnt;
2535+ struct vfsmount *lower_mnt;
2536+ /* pathnames of lower and upper dirs, for show_options */
2537+ struct ovl_config config;
2538+};
2539+
2540+/* private information held for every overlayfs dentry */
2541+struct ovl_entry {
2542+ /*
2543+ * Keep "double reference" on upper dentries, so that
2544+ * d_delete() doesn't think it's OK to reset d_inode to NULL.
2545+ */
2546+ struct dentry *__upperdentry;
2547+ struct dentry *lowerdentry;
2548+ union {
2549+ struct {
2550+ u64 version;
2551+ bool opaque;
2552+ };
2553+ struct rcu_head rcu;
2554+ };
2555+};
2556+
2557+const char *ovl_whiteout_xattr = "trusted.overlay.whiteout";
2558+const char *ovl_opaque_xattr = "trusted.overlay.opaque";
2559+
2560+
2561+enum ovl_path_type ovl_path_type(struct dentry *dentry)
2562+{
2563+ struct ovl_entry *oe = dentry->d_fsdata;
2564+
2565+ if (oe->__upperdentry) {
2566+ if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode))
2567+ return OVL_PATH_MERGE;
2568+ else
2569+ return OVL_PATH_UPPER;
2570+ } else {
2571+ return OVL_PATH_LOWER;
2572+ }
2573+}
2574+
2575+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
2576+{
2577+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
2578+ smp_read_barrier_depends();
2579+ return upperdentry;
2580+}
2581+
2582+void ovl_path_upper(struct dentry *dentry, struct path *path)
2583+{
2584+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2585+ struct ovl_entry *oe = dentry->d_fsdata;
2586+
2587+ path->mnt = ofs->upper_mnt;
2588+ path->dentry = ovl_upperdentry_dereference(oe);
2589+}
2590+
2591+void ovl_path_lower(struct dentry *dentry, struct path *path)
2592+{
2593+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
2594+ struct ovl_entry *oe = dentry->d_fsdata;
2595+
2596+ path->mnt = ofs->lower_mnt;
2597+ path->dentry = oe->lowerdentry;
2598+}
2599+
2600+enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
2601+{
2602+
2603+ enum ovl_path_type type = ovl_path_type(dentry);
2604+
2605+ if (type == OVL_PATH_LOWER)
2606+ ovl_path_lower(dentry, path);
2607+ else
2608+ ovl_path_upper(dentry, path);
2609+
2610+ return type;
2611+}
2612+
2613+struct dentry *ovl_dentry_upper(struct dentry *dentry)
2614+{
2615+ struct ovl_entry *oe = dentry->d_fsdata;
2616+
2617+ return ovl_upperdentry_dereference(oe);
2618+}
2619+
2620+struct dentry *ovl_dentry_lower(struct dentry *dentry)
2621+{
2622+ struct ovl_entry *oe = dentry->d_fsdata;
2623+
2624+ return oe->lowerdentry;
2625+}
2626+
2627+struct dentry *ovl_dentry_real(struct dentry *dentry)
2628+{
2629+ struct ovl_entry *oe = dentry->d_fsdata;
2630+ struct dentry *realdentry;
2631+
2632+ realdentry = ovl_upperdentry_dereference(oe);
2633+ if (!realdentry)
2634+ realdentry = oe->lowerdentry;
2635+
2636+ return realdentry;
2637+}
2638+
2639+struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
2640+{
2641+ struct dentry *realdentry;
2642+
2643+ realdentry = ovl_upperdentry_dereference(oe);
2644+ if (realdentry) {
2645+ *is_upper = true;
2646+ } else {
2647+ realdentry = oe->lowerdentry;
2648+ *is_upper = false;
2649+ }
2650+ return realdentry;
2651+}
2652+
2653+bool ovl_dentry_is_opaque(struct dentry *dentry)
2654+{
2655+ struct ovl_entry *oe = dentry->d_fsdata;
2656+ return oe->opaque;
2657+}
2658+
2659+void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
2660+{
2661+ struct ovl_entry *oe = dentry->d_fsdata;
2662+ oe->opaque = opaque;
2663+}
2664+
2665+void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
2666+{
2667+ struct ovl_entry *oe = dentry->d_fsdata;
2668+
2669+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
2670+ WARN_ON(oe->__upperdentry);
2671+ BUG_ON(!upperdentry->d_inode);
2672+ smp_wmb();
2673+ oe->__upperdentry = dget(upperdentry);
2674+}
2675+
2676+void ovl_dentry_version_inc(struct dentry *dentry)
2677+{
2678+ struct ovl_entry *oe = dentry->d_fsdata;
2679+
2680+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2681+ oe->version++;
2682+}
2683+
2684+u64 ovl_dentry_version_get(struct dentry *dentry)
2685+{
2686+ struct ovl_entry *oe = dentry->d_fsdata;
2687+
2688+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
2689+ return oe->version;
2690+}
2691+
2692+bool ovl_is_whiteout(struct dentry *dentry)
2693+{
2694+ int res;
2695+ char val;
2696+
2697+ if (!dentry)
2698+ return false;
2699+ if (!dentry->d_inode)
2700+ return false;
2701+ if (!S_ISLNK(dentry->d_inode->i_mode))
2702+ return false;
2703+
2704+ res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1);
2705+ if (res == 1 && val == 'y')
2706+ return true;
2707+
2708+ return false;
2709+}
2710+
2711+static bool ovl_is_opaquedir(struct dentry *dentry)
2712+{
2713+ int res;
2714+ char val;
2715+
2716+ if (!S_ISDIR(dentry->d_inode->i_mode))
2717+ return false;
2718+
2719+ res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1);
2720+ if (res == 1 && val == 'y')
2721+ return true;
2722+
2723+ return false;
2724+}
2725+
2726+static void ovl_entry_free(struct rcu_head *head)
2727+{
2728+ struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu);
2729+ kfree(oe);
2730+}
2731+
2732+static void ovl_dentry_release(struct dentry *dentry)
2733+{
2734+ struct ovl_entry *oe = dentry->d_fsdata;
2735+
2736+ if (oe) {
2737+ dput(oe->__upperdentry);
2738+ dput(oe->__upperdentry);
2739+ dput(oe->lowerdentry);
2740+ call_rcu(&oe->rcu, ovl_entry_free);
2741+ }
2742+}
2743+
2744+const struct dentry_operations ovl_dentry_operations = {
2745+ .d_release = ovl_dentry_release,
2746+};
2747+
2748+static struct ovl_entry *ovl_alloc_entry(void)
2749+{
2750+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
2751+}
2752+
2753+static inline struct dentry *ovl_lookup_real(struct dentry *dir,
2754+ struct qstr *name)
2755+{
2756+ struct dentry *dentry;
2757+
2758+ mutex_lock(&dir->d_inode->i_mutex);
2759+ dentry = lookup_one_len(name->name, dir, name->len);
2760+ mutex_unlock(&dir->d_inode->i_mutex);
2761+
2762+ if (IS_ERR(dentry)) {
2763+ if (PTR_ERR(dentry) == -ENOENT)
2764+ dentry = NULL;
2765+ } else if (!dentry->d_inode) {
2766+ dput(dentry);
2767+ dentry = NULL;
2768+ }
2769+ return dentry;
2770+}
2771+
2772+static int ovl_do_lookup(struct dentry *dentry)
2773+{
2774+ struct ovl_entry *oe;
2775+ struct dentry *upperdir;
2776+ struct dentry *lowerdir;
2777+ struct dentry *upperdentry = NULL;
2778+ struct dentry *lowerdentry = NULL;
2779+ struct inode *inode = NULL;
2780+ int err;
2781+
2782+ err = -ENOMEM;
2783+ oe = ovl_alloc_entry();
2784+ if (!oe)
2785+ goto out;
2786+
2787+ upperdir = ovl_dentry_upper(dentry->d_parent);
2788+ lowerdir = ovl_dentry_lower(dentry->d_parent);
2789+
2790+ if (upperdir) {
2791+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
2792+ err = PTR_ERR(upperdentry);
2793+ if (IS_ERR(upperdentry))
2794+ goto out_put_dir;
2795+
2796+ if (lowerdir && upperdentry &&
2797+ (S_ISLNK(upperdentry->d_inode->i_mode) ||
2798+ S_ISDIR(upperdentry->d_inode->i_mode))) {
2799+ const struct cred *old_cred;
2800+ struct cred *override_cred;
2801+
2802+ err = -ENOMEM;
2803+ override_cred = prepare_creds();
2804+ if (!override_cred)
2805+ goto out_dput_upper;
2806+
2807+ /* CAP_SYS_ADMIN needed for getxattr */
2808+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
2809+ old_cred = override_creds(override_cred);
2810+
2811+ if (ovl_is_opaquedir(upperdentry)) {
2812+ oe->opaque = true;
2813+ } else if (ovl_is_whiteout(upperdentry)) {
2814+ dput(upperdentry);
2815+ upperdentry = NULL;
2816+ oe->opaque = true;
2817+ }
2818+ revert_creds(old_cred);
2819+ put_cred(override_cred);
2820+ }
2821+ }
2822+ if (lowerdir && !oe->opaque) {
2823+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
2824+ err = PTR_ERR(lowerdentry);
2825+ if (IS_ERR(lowerdentry))
2826+ goto out_dput_upper;
2827+ }
2828+
2829+ if (lowerdentry && upperdentry &&
2830+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
2831+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
2832+ dput(lowerdentry);
2833+ lowerdentry = NULL;
2834+ oe->opaque = true;
2835+ }
2836+
2837+ if (lowerdentry || upperdentry) {
2838+ struct dentry *realdentry;
2839+
2840+ realdentry = upperdentry ? upperdentry : lowerdentry;
2841+ err = -ENOMEM;
2842+ inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
2843+ oe);
2844+ if (!inode)
2845+ goto out_dput;
2846+ }
2847+
2848+ if (upperdentry)
2849+ oe->__upperdentry = dget(upperdentry);
2850+
2851+ if (lowerdentry)
2852+ oe->lowerdentry = lowerdentry;
2853+
2854+ dentry->d_fsdata = oe;
2855+ dentry->d_op = &ovl_dentry_operations;
2856+ d_add(dentry, inode);
2857+
2858+ return 0;
2859+
2860+out_dput:
2861+ dput(lowerdentry);
2862+out_dput_upper:
2863+ dput(upperdentry);
2864+out_put_dir:
2865+ kfree(oe);
2866+out:
2867+ return err;
2868+}
2869+
2870+struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
2871+ struct nameidata *nd)
2872+{
2873+ int err = ovl_do_lookup(dentry);
2874+
2875+ if (err)
2876+ return ERR_PTR(err);
2877+
2878+ return NULL;
2879+}
2880+
2881+struct file *ovl_path_open(struct path *path, int flags)
2882+{
2883+ path_get(path);
2884+ return dentry_open(path->dentry, path->mnt, flags, current_cred());
2885+}
2886+
2887+static void ovl_put_super(struct super_block *sb)
2888+{
2889+ struct ovl_fs *ufs = sb->s_fs_info;
2890+
2891+ if (!(sb->s_flags & MS_RDONLY))
2892+ mnt_drop_write(ufs->upper_mnt);
2893+
2894+ mntput(ufs->upper_mnt);
2895+ mntput(ufs->lower_mnt);
2896+
2897+ kfree(ufs->config.lowerdir);
2898+ kfree(ufs->config.upperdir);
2899+ kfree(ufs);
2900+}
2901+
2902+static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data)
2903+{
2904+ int flags = *flagsp;
2905+ struct ovl_fs *ufs = sb->s_fs_info;
2906+
2907+ /* When remounting rw or ro, we need to adjust the write access to the
2908+ * upper fs.
2909+ */
2910+ if (((flags ^ sb->s_flags) & MS_RDONLY) == 0)
2911+ /* No change to readonly status */
2912+ return 0;
2913+
2914+ if (flags & MS_RDONLY) {
2915+ mnt_drop_write(ufs->upper_mnt);
2916+ return 0;
2917+ } else
2918+ return mnt_want_write(ufs->upper_mnt);
2919+}
2920+
2921+/**
2922+ * ovl_statfs
2923+ * @dentry: A dentry on the overlayfs super block
2924+ * @buf: The struct kstatfs to fill in with stats
2925+ *
2926+ * Get the filesystem statistics. As writes always target the upper layer
2927+ * filesystem pass the statfs to the same filesystem.
2928+ */
2929+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
2930+{
2931+ struct dentry *root_dentry = dentry->d_sb->s_root;
2932+ struct path path;
2933+ ovl_path_upper(root_dentry, &path);
2934+
2935+ if (!path.dentry->d_sb->s_op->statfs)
2936+ return -ENOSYS;
2937+ return path.dentry->d_sb->s_op->statfs(path.dentry, buf);
2938+}
2939+
2940+/**
2941+ * ovl_show_options
2942+ *
2943+ * Prints the mount options for a given superblock.
2944+ * Returns zero; does not fail.
2945+ */
2946+static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
2947+{
2948+ struct super_block *sb = dentry->d_sb;
2949+ struct ovl_fs *ufs = sb->s_fs_info;
2950+
2951+ seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
2952+ seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
2953+ return 0;
2954+}
2955+
2956+static const struct super_operations ovl_super_operations = {
2957+ .put_super = ovl_put_super,
2958+ .remount_fs = ovl_remount_fs,
2959+ .statfs = ovl_statfs,
2960+ .show_options = ovl_show_options,
2961+};
2962+
2963+enum {
2964+ Opt_lowerdir,
2965+ Opt_upperdir,
2966+ Opt_err,
2967+};
2968+
2969+static const match_table_t ovl_tokens = {
2970+ {Opt_lowerdir, "lowerdir=%s"},
2971+ {Opt_upperdir, "upperdir=%s"},
2972+ {Opt_err, NULL}
2973+};
2974+
2975+static int ovl_parse_opt(char *opt, struct ovl_config *config)
2976+{
2977+ char *p;
2978+
2979+ config->upperdir = NULL;
2980+ config->lowerdir = NULL;
2981+
2982+ while ((p = strsep(&opt, ",")) != NULL) {
2983+ int token;
2984+ substring_t args[MAX_OPT_ARGS];
2985+
2986+ if (!*p)
2987+ continue;
2988+
2989+ token = match_token(p, ovl_tokens, args);
2990+ switch (token) {
2991+ case Opt_upperdir:
2992+ kfree(config->upperdir);
2993+ config->upperdir = match_strdup(&args[0]);
2994+ if (!config->upperdir)
2995+ return -ENOMEM;
2996+ break;
2997+
2998+ case Opt_lowerdir:
2999+ kfree(config->lowerdir);
3000+ config->lowerdir = match_strdup(&args[0]);
3001+ if (!config->lowerdir)
3002+ return -ENOMEM;
3003+ break;
3004+
3005+ default:
3006+ return -EINVAL;
3007+ }
3008+ }
3009+ return 0;
3010+}
3011+
3012+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
3013+{
3014+ struct path lowerpath;
3015+ struct path upperpath;
3016+ struct inode *root_inode;
3017+ struct dentry *root_dentry;
3018+ struct ovl_entry *oe;
3019+ struct ovl_fs *ufs;
3020+ int err;
3021+
3022+ err = -ENOMEM;
3023+ ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
3024+ if (!ufs)
3025+ goto out;
3026+
3027+ err = ovl_parse_opt((char *) data, &ufs->config);
3028+ if (err)
3029+ goto out_free_config; /* free any option string parsed before the failure */
3030+
3031+ err = -EINVAL;
3032+ if (!ufs->config.upperdir || !ufs->config.lowerdir) {
3033+ printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");
3034+ goto out_free_config;
3035+ }
3036+ err = -ENOMEM; /* the two allocations below fail with ENOMEM, not EINVAL */
3037+ oe = ovl_alloc_entry();
3038+ if (oe == NULL)
3039+ goto out_free_config;
3040+
3041+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
3042+ if (!root_inode)
3043+ goto out_free_oe;
3044+
3045+ err = kern_path(ufs->config.upperdir, LOOKUP_FOLLOW, &upperpath);
3046+ if (err)
3047+ goto out_put_root;
3048+
3049+ err = kern_path(ufs->config.lowerdir, LOOKUP_FOLLOW, &lowerpath);
3050+ if (err)
3051+ goto out_put_upperpath;
3052+
3053+ err = -ENOTDIR;
3054+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
3055+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode))
3056+ goto out_put_lowerpath;
3057+
3058+ sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
3059+ lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
3060+
3061+ err = -EINVAL;
3062+ if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
3063+ printk(KERN_ERR "overlayfs: maximum fs stacking depth exceeded\n");
3064+ goto out_put_lowerpath;
3065+ }
3066+
3067+
3068+ ufs->upper_mnt = clone_private_mount(&upperpath);
3069+ err = PTR_ERR(ufs->upper_mnt);
3070+ if (IS_ERR(ufs->upper_mnt)) {
3071+ printk(KERN_ERR "overlayfs: failed to clone upperpath\n");
3072+ goto out_put_lowerpath;
3073+ }
3074+
3075+ ufs->lower_mnt = clone_private_mount(&lowerpath);
3076+ err = PTR_ERR(ufs->lower_mnt);
3077+ if (IS_ERR(ufs->lower_mnt)) {
3078+ printk(KERN_ERR "overlayfs: failed to clone lowerpath\n");
3079+ goto out_put_upper_mnt;
3080+ }
3081+
3082+ /*
3083+ * Make lower_mnt R/O. That way fchmod/fchown on lower file
3084+ * will fail instead of modifying lower fs.
3085+ */
3086+ ufs->lower_mnt->mnt_flags |= MNT_READONLY;
3087+
3088+ /* If the upper fs is r/o, we mark overlayfs r/o too */
3089+ if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
3090+ sb->s_flags |= MS_RDONLY;
3091+
3092+ if (!(sb->s_flags & MS_RDONLY)) {
3093+ err = mnt_want_write(ufs->upper_mnt);
3094+ if (err)
3095+ goto out_put_lower_mnt;
3096+ }
3097+
3098+ err = -ENOMEM;
3099+ root_dentry = d_alloc_root(root_inode);
3100+ if (!root_dentry)
3101+ goto out_drop_write;
3102+
3103+ mntput(upperpath.mnt);
3104+ mntput(lowerpath.mnt);
3105+
3106+ oe->__upperdentry = dget(upperpath.dentry);
3107+ oe->lowerdentry = lowerpath.dentry;
3108+
3109+ root_dentry->d_fsdata = oe;
3110+ root_dentry->d_op = &ovl_dentry_operations;
3111+
3112+ sb->s_op = &ovl_super_operations;
3113+ sb->s_root = root_dentry;
3114+ sb->s_fs_info = ufs;
3115+
3116+ return 0;
3117+
3118+out_drop_write:
3119+ if (!(sb->s_flags & MS_RDONLY))
3120+ mnt_drop_write(ufs->upper_mnt);
3121+out_put_lower_mnt:
3122+ mntput(ufs->lower_mnt);
3123+out_put_upper_mnt:
3124+ mntput(ufs->upper_mnt);
3125+out_put_lowerpath:
3126+ path_put(&lowerpath);
3127+out_put_upperpath:
3128+ path_put(&upperpath);
3129+out_put_root:
3130+ iput(root_inode);
3131+out_free_oe:
3132+ kfree(oe);
3133+out_free_config:
3134+ kfree(ufs->config.lowerdir);
3135+ kfree(ufs->config.upperdir);
3136+ /* option strings are released above; now free the container itself */
3137+ kfree(ufs);
3138+out:
3139+ return err;
3140+}
3141+
3142+static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
3143+ const char *dev_name, void *raw_data)
3144+{
3145+ return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
3146+}
3147+
3148+static struct file_system_type ovl_fs_type = {
3149+ .owner = THIS_MODULE,
3150+ .name = "overlayfs",
3151+ .mount = ovl_mount,
3152+ .kill_sb = kill_anon_super,
3153+};
3154+
3155+static int __init ovl_init(void)
3156+{
3157+ return register_filesystem(&ovl_fs_type);
3158+}
3159+
3160+static void __exit ovl_exit(void)
3161+{
3162+ unregister_filesystem(&ovl_fs_type);
3163+}
3164+
3165+module_init(ovl_init);
3166+module_exit(ovl_exit);
3167--- a/fs/splice.c
3168+++ b/fs/splice.c
3169@@ -1299,6 +1299,7 @@ long do_splice_direct(struct file *in, l
3170 
3171     return ret;
3172 }
3173+EXPORT_SYMBOL(do_splice_direct);
3174 
3175 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
3176                    struct pipe_inode_info *opipe,
3177--- a/include/linux/fs.h
3178+++ b/include/linux/fs.h
3179@@ -484,6 +484,12 @@ struct iattr {
3180  */
3181 #include <linux/quota.h>
3182 
3183+/*
3184+ * Maximum number of layers of fs stack. Needs to be limited to
3185+ * prevent kernel stack overflow
3186+ */
3187+#define FILESYSTEM_MAX_STACK_DEPTH 2
3188+
3189 /**
3190  * enum positive_aop_returns - aop return codes with specific semantics
3191  *
3192@@ -1496,6 +1502,11 @@ struct super_block {
3193 
3194     /* Being remounted read-only */
3195     int s_readonly_remount;
3196+
3197+ /*
3198+ * Indicates how deep in a filesystem stack this SB is
3199+ */
3200+ int s_stack_depth;
3201 };
3202 
3203 /* superblock cache pruning functions */
3204@@ -1653,6 +1664,8 @@ struct inode_operations {
3205     void (*truncate_range)(struct inode *, loff_t, loff_t);
3206     int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
3207               u64 len);
3208+ struct file *(*open) (struct dentry *, struct file *,
3209+ const struct cred *);
3210 } ____cacheline_aligned;
3211 
3212 struct seq_file;
3213@@ -2023,6 +2036,7 @@ extern long do_sys_open(int dfd, const c
3214 extern struct file *filp_open(const char *, int, umode_t);
3215 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
3216                    const char *, int);
3217+extern struct file *vfs_open(struct path *, struct file *, const struct cred *);
3218 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
3219                  const struct cred *);
3220 extern int filp_close(struct file *, fl_owner_t id);
3221--- a/include/linux/mount.h
3222+++ b/include/linux/mount.h
3223@@ -66,6 +66,9 @@ extern void mnt_pin(struct vfsmount *mnt
3224 extern void mnt_unpin(struct vfsmount *mnt);
3225 extern int __mnt_is_readonly(struct vfsmount *mnt);
3226 
3227+struct path;
3228+extern struct vfsmount *clone_private_mount(struct path *path);
3229+
3230 struct file_system_type;
3231 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
3232                       int flags, const char *name,
3233

Archive Download this file



interactive