Root/target/linux/generic/patches-2.6.38/100-overlayfs.patch

1--- /dev/null
2+++ b/Documentation/filesystems/overlayfs.txt
3@@ -0,0 +1,163 @@
4+Written by: Neil Brown <neilb@suse.de>
5+
6+Overlay Filesystem
7+==================
8+
9+This document describes a prototype for a new approach to providing
10+overlay-filesystem functionality in Linux (sometimes referred to as
11+union-filesystems). An overlay-filesystem tries to present a
12+filesystem which is the result of overlaying one filesystem on top
13+of the other.
14+
15+The result will inevitably fail to look exactly like a normal
16+filesystem for various technical reasons. The expectation is that
17+many use cases will be able to ignore these differences.
18+
19+This approach is 'hybrid' because the objects that appear in the
20+filesystem do not all appear to belong to that filesystem. In many
21+cases an object accessed in the union will be indistinguishable
22+from accessing the corresponding object from the original filesystem.
23+This is most obvious from the 'st_dev' field returned by stat(2).
24+
25+While directories will report an st_dev for the overlay-filesystem,
26+all non-directory objects will report an st_dev from whichever of the
27+'lower' or 'upper' filesystems is providing the object. Similarly
28+st_ino will only be unique when combined with st_dev, and both of
29+these can change over the lifetime of a non-directory object. Many
30+applications and tools ignore these values and will not be affected.
31+
32+Upper and Lower
33+---------------
34+
35+An overlay filesystem combines two filesystems - an 'upper' filesystem
36+and a 'lower' filesystem. When a name exists in both filesystems, the
37+object in the 'upper' filesystem is visible while the object in the
38+'lower' filesystem is either hidden or, in the case of directories,
39+merged with the 'upper' object.
40+
41+It would be more correct to refer to an upper and lower 'directory
42+tree' rather than 'filesystem' as it is quite possible for both
43+directory trees to be in the same filesystem and there is no
44+requirement that the root of a filesystem be given for either upper or
45+lower.
46+
47+The lower filesystem can be any filesystem supported by Linux and does
48+not need to be writable. The lower filesystem can even be another
49+overlayfs. The upper filesystem will normally be writable and if it
50+is, it must support the creation of trusted.* extended attributes, and
51+must provide valid d_type in readdir responses, at least for symbolic
52+links - so NFS is not suitable.
53+
54+A read-only overlay of two read-only filesystems may use any
55+filesystem type.
56+
57+Directories
58+-----------
59+
60+Overlaying mainly involves directories. If a given name appears in both
61+upper and lower filesystems and refers to a non-directory in either,
62+then the lower object is hidden - the name refers only to the upper
63+object.
64+
65+Where both upper and lower objects are directories, a merged directory
66+is formed.
67+
68+At mount time, the two directories given as mount options are combined
69+into a merged directory. Then whenever a lookup is requested in such
70+a merged directory, the lookup is performed in each actual directory
71+and the combined result is cached in the dentry belonging to the overlay
72+filesystem. If both actual lookups find directories, both are stored
73+and a merged directory is created, otherwise only one is stored: the
74+upper if it exists, else the lower.
75+
76+Only the lists of names from directories are merged. Other content
77+such as metadata and extended attributes are reported for the upper
78+directory only. These attributes of the lower directory are hidden.
79+
80+Whiteouts and opaque directories
81+--------------------------------
82+
83+In order to support rm and rmdir without changing the lower
84+filesystem, an overlay filesystem needs to record in the upper filesystem
85+that files have been removed. This is done using whiteouts and opaque
86+directories (non-directories are always opaque).
87+
88+The overlay filesystem uses extended attributes with a
89+"trusted.overlay." prefix to record these details.
90+
91+A whiteout is created as a symbolic link with target
92+"(overlay-whiteout)" and with xattr "trusted.overlay.whiteout" set to "y".
93+When a whiteout is found in the upper level of a merged directory, any
94+matching name in the lower level is ignored, and the whiteout itself
95+is also hidden.
96+
97+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
98+to "y". Where the upper filesystem contains an opaque directory, any
99+directory in the lower filesystem with the same name is ignored.
100+
101+readdir
102+-------
103+
104+When a 'readdir' request is made on a merged directory, the upper and
105+lower directories are each read and the name lists merged in the
106+obvious way (upper is read first, then lower - entries that already
107+exist are not re-added). This merged name list is cached in the
108+'struct file' and so remains as long as the file is kept open. If the
109+directory is opened and read by two processes at the same time, they
110+will each have separate caches. A seekdir to the start of the
111+directory (offset 0) followed by a readdir will cause the cache to be
112+discarded and rebuilt.
113+
114+This means that changes to the merged directory do not appear while a
115+directory is being read. This is unlikely to be noticed by many
116+programs.
117+
118+seek offsets are assigned sequentially when the directories are read.
119+Thus if a program were to:
120+ - read part of a directory
121+ - remember an offset, and close the directory
122+ - re-open the directory some time later
123+ - seek to the remembered offset
124+
125+there may be little correlation between the old and new locations in
126+the list of filenames, particularly if anything has changed in the
127+directory.
128+
129+Readdir on directories that are not merged is simply handled by the
130+underlying directory (upper or lower).
131+
132+
133+Non-directories
134+---------------
135+
136+Objects that are not directories (files, symlinks, device-special
137+files etc) are presented either from the upper or lower filesystem as
138+appropriate. When a file in the lower filesystem is accessed in a way
139+that requires write-access, such as opening for write access, changing
140+some metadata etc, the file is first copied from the lower filesystem
141+to the upper filesystem (copy_up). Note that creating a hard-link
142+also requires copy-up, though of course creation of a symlink does
143+not.
144+
145+The copy_up process first makes sure that the containing directory
146+exists in the upper filesystem - creating it and any parents as
147+necessary. It then creates the object with the same metadata (owner,
148+mode, mtime, symlink-target etc) and then if the object is a file, the
149+data is copied from the lower to the upper filesystem. Finally any
150+extended attributes are copied up.
151+
152+Once the copy_up is complete, the overlay filesystem simply
153+provides direct access to the newly created file in the upper
154+filesystem - future operations on the file are barely noticed by the
155+overlay filesystem (though an operation on the name of the file such as
156+rename or unlink will of course be noticed and handled).
157+
158+Changes to underlying filesystems
159+---------------------------------
160+
161+Offline changes, when the overlay is not mounted, are allowed to either
162+the upper or the lower trees.
163+
164+Changes to the underlying filesystems while part of a mounted overlay
165+filesystem are not allowed. This is not yet enforced, but will be in
166+the future.
167--- a/fs/Kconfig
168+++ b/fs/Kconfig
169@@ -63,6 +63,7 @@ source "fs/quota/Kconfig"
170 
171 source "fs/autofs4/Kconfig"
172 source "fs/fuse/Kconfig"
173+source "fs/overlayfs/Kconfig"
174 
175 config CUSE
176     tristate "Character device in Userspace support"
177--- a/fs/Makefile
178+++ b/fs/Makefile
179@@ -103,6 +103,7 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4/
180 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
181 obj-$(CONFIG_ADFS_FS) += adfs/
182 obj-$(CONFIG_FUSE_FS) += fuse/
183+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/
184 obj-$(CONFIG_UDF_FS) += udf/
185 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
186 obj-$(CONFIG_OMFS_FS) += omfs/
187--- a/fs/namespace.c
188+++ b/fs/namespace.c
189@@ -1451,6 +1451,23 @@ void drop_collected_mounts(struct vfsmou
190     release_mounts(&umount_list);
191 }
192 
193+struct vfsmount *clone_private_mount(struct path *path)
194+{
195+ struct vfsmount *mnt;
196+
197+ if (IS_MNT_UNBINDABLE(path->mnt))
198+ return ERR_PTR(-EINVAL);
199+
200+ down_read(&namespace_sem);
201+ mnt = clone_mnt(path->mnt, path->dentry, CL_PRIVATE);
202+ up_read(&namespace_sem);
203+ if (!mnt)
204+ return ERR_PTR(-ENOMEM);
205+
206+ return mnt;
207+}
208+EXPORT_SYMBOL_GPL(clone_private_mount);
209+
210 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
211            struct vfsmount *root)
212 {
213--- a/fs/open.c
214+++ b/fs/open.c
215@@ -664,19 +664,19 @@ static inline int __get_file_write_acces
216     return error;
217 }
218 
219-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
220- struct file *f,
221+static struct file *__dentry_open(struct path *path, struct file *f,
222                     int (*open)(struct inode *, struct file *),
223                     const struct cred *cred)
224 {
225     struct inode *inode;
226     int error;
227 
228+ path_get(path);
229     f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
230                 FMODE_PREAD | FMODE_PWRITE;
231- inode = dentry->d_inode;
232+ inode = path->dentry->d_inode;
233     if (f->f_mode & FMODE_WRITE) {
234- error = __get_file_write_access(inode, mnt);
235+ error = __get_file_write_access(inode, path->mnt);
236         if (error)
237             goto cleanup_file;
238         if (!special_file(inode->i_mode))
239@@ -684,8 +684,7 @@ static struct file *__dentry_open(struct
240     }
241 
242     f->f_mapping = inode->i_mapping;
243- f->f_path.dentry = dentry;
244- f->f_path.mnt = mnt;
245+ f->f_path = *path;
246     f->f_pos = 0;
247     f->f_op = fops_get(inode->i_fop);
248     file_sb_list_add(f, inode->i_sb);
249@@ -731,7 +730,7 @@ cleanup_all:
250              * here, so just reset the state.
251              */
252             file_reset_write(f);
253- mnt_drop_write(mnt);
254+ mnt_drop_write(path->mnt);
255         }
256     }
257     file_sb_list_del(f);
258@@ -739,8 +738,7 @@ cleanup_all:
259     f->f_path.mnt = NULL;
260 cleanup_file:
261     put_filp(f);
262- dput(dentry);
263- mntput(mnt);
264+ path_put(path);
265     return ERR_PTR(error);
266 }
267 
268@@ -766,14 +764,14 @@ cleanup_file:
269 struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry,
270         int (*open)(struct inode *, struct file *))
271 {
272+ struct path path = { .dentry = dentry, .mnt = nd->path.mnt };
273     const struct cred *cred = current_cred();
274 
275     if (IS_ERR(nd->intent.open.file))
276         goto out;
277     if (IS_ERR(dentry))
278         goto out_err;
279- nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
280- nd->intent.open.file,
281+ nd->intent.open.file = __dentry_open(&path, nd->intent.open.file,
282                          open, cred);
283 out:
284     return nd->intent.open.file;
285@@ -802,10 +800,17 @@ struct file *nameidata_to_filp(struct na
286 
287     /* Has the filesystem initialised the file for us? */
288     if (filp->f_path.dentry == NULL) {
289- path_get(&nd->path);
290- filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
291- NULL, cred);
292+ struct inode *inode = nd->path.dentry->d_inode;
293+
294+ if (inode->i_op->open) {
295+ int flags = filp->f_flags;
296+ put_filp(filp);
297+ filp = inode->i_op->open(nd->path.dentry, flags, cred);
298+ } else {
299+ filp = __dentry_open(&nd->path, filp, NULL, cred);
300+ }
301     }
302+
303     return filp;
304 }
305 
306@@ -816,35 +821,45 @@ struct file *nameidata_to_filp(struct na
307 struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
308              const struct cred *cred)
309 {
310- int error;
311+ struct path path = { .dentry = dentry, .mnt = mnt };
312+ struct file *ret;
313+
314+ BUG_ON(!mnt);
315+
316+ ret = vfs_open(&path, flags, cred);
317+ path_put(&path);
318+
319+ return ret;
320+}
321+EXPORT_SYMBOL(dentry_open);
322+
323+/**
324+ * vfs_open - open the file at the given path
325+ * @path: path to open
326+ * @flags: open flags
327+ * @cred: credentials to use
328+ *
329+ * Open the file. If successful, the returned file will have acquired
330+ * an additional reference for path.
331+ */
332+struct file *vfs_open(struct path *path, int flags, const struct cred *cred)
333+{
334     struct file *f;
335+ struct inode *inode = path->dentry->d_inode;
336 
337     validate_creds(cred);
338 
339- /*
340- * We must always pass in a valid mount pointer. Historically
341- * callers got away with not passing it, but we must enforce this at
342- * the earliest possible point now to avoid strange problems deep in the
343- * filesystem stack.
344- */
345- if (!mnt) {
346- printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
347- dump_stack();
348- return ERR_PTR(-EINVAL);
349- }
350+ if (inode->i_op->open)
351+ return inode->i_op->open(path->dentry, flags, cred);
352 
353- error = -ENFILE;
354     f = get_empty_filp();
355- if (f == NULL) {
356- dput(dentry);
357- mntput(mnt);
358- return ERR_PTR(error);
359- }
360+ if (f == NULL)
361+ return ERR_PTR(-ENFILE);
362 
363     f->f_flags = flags;
364- return __dentry_open(dentry, mnt, f, NULL, cred);
365+ return __dentry_open(path, f, NULL, cred);
366 }
367-EXPORT_SYMBOL(dentry_open);
368+EXPORT_SYMBOL(vfs_open);
369 
370 static void __put_unused_fd(struct files_struct *files, unsigned int fd)
371 {
372--- /dev/null
373+++ b/fs/overlayfs/Kconfig
374@@ -0,0 +1,4 @@
375+config OVERLAYFS_FS
376+ tristate "Overlay filesystem support"
377+ help
378+ Add support for overlay filesystem.
379--- /dev/null
380+++ b/fs/overlayfs/Makefile
381@@ -0,0 +1,5 @@
382+#
383+# Makefile for the overlay filesystem.
384+#
385+
386+obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
387--- /dev/null
388+++ b/fs/overlayfs/overlayfs.c
389@@ -0,0 +1,2358 @@
390+#include <linux/fs.h>
391+#include <linux/namei.h>
392+#include <linux/sched.h>
393+#include <linux/fs_struct.h>
394+#include <linux/file.h>
395+#include <linux/xattr.h>
396+#include <linux/security.h>
397+#include <linux/device_cgroup.h>
398+#include <linux/mount.h>
399+#include <linux/splice.h>
400+#include <linux/slab.h>
401+#include <linux/parser.h>
402+#include <linux/module.h>
403+#include <linux/uaccess.h>
404+#include <linux/rbtree.h>
405+
406+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
407+MODULE_DESCRIPTION("Overlay filesystem");
408+MODULE_LICENSE("GPL");
409+
410+#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
411+
412+struct ovl_fs {
413+ struct vfsmount *upper_mnt;
414+ struct vfsmount *lower_mnt;
415+};
416+
417+struct ovl_entry {
418+ struct dentry *__upperdentry;
419+ struct dentry *lowerdentry;
420+ union {
421+ struct {
422+ u64 version;
423+ bool opaque;
424+ };
425+ struct rcu_head rcu;
426+ };
427+};
428+
429+static const char *ovl_whiteout_xattr = "trusted.overlay.whiteout";
430+static const char *ovl_opaque_xattr = "trusted.overlay.opaque";
431+static const char *ovl_whiteout_symlink = "(overlay-whiteout)";
432+
433+enum ovl_path_type {
434+ OVL_PATH_UPPER,
435+ OVL_PATH_MERGE,
436+ OVL_PATH_LOWER,
437+};
438+
439+static enum ovl_path_type ovl_path_type(struct dentry *dentry)
440+{
441+ struct ovl_entry *oe = dentry->d_fsdata;
442+
443+ if (oe->__upperdentry) {
444+ if (oe->lowerdentry && S_ISDIR(dentry->d_inode->i_mode))
445+ return OVL_PATH_MERGE;
446+ else
447+ return OVL_PATH_UPPER;
448+ } else {
449+ return OVL_PATH_LOWER;
450+ }
451+}
452+
453+static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
454+{
455+ struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
456+ smp_read_barrier_depends();
457+ return upperdentry;
458+}
459+
460+static void ovl_path_upper(struct dentry *dentry, struct path *path)
461+{
462+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
463+ struct ovl_entry *oe = dentry->d_fsdata;
464+
465+ path->mnt = ofs->upper_mnt;
466+ path->dentry = ovl_upperdentry_dereference(oe);
467+}
468+
469+static void ovl_path_lower(struct dentry *dentry, struct path *path)
470+{
471+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
472+ struct ovl_entry *oe = dentry->d_fsdata;
473+
474+ path->mnt = ofs->lower_mnt;
475+ path->dentry = oe->lowerdentry;
476+}
477+
478+static enum ovl_path_type ovl_path_real(struct dentry *dentry,
479+ struct path *path)
480+{
481+
482+ enum ovl_path_type type = ovl_path_type(dentry);
483+
484+ if (type == OVL_PATH_LOWER)
485+ ovl_path_lower(dentry, path);
486+ else
487+ ovl_path_upper(dentry, path);
488+
489+ return type;
490+}
491+
492+static struct dentry *ovl_dentry_upper(struct dentry *dentry)
493+{
494+ struct ovl_entry *oe = dentry->d_fsdata;
495+
496+ return ovl_upperdentry_dereference(oe);
497+}
498+
499+static struct dentry *ovl_dentry_lower(struct dentry *dentry)
500+{
501+ struct ovl_entry *oe = dentry->d_fsdata;
502+
503+ return oe->lowerdentry;
504+}
505+
506+static struct dentry *ovl_dentry_real(struct dentry *dentry)
507+{
508+ struct ovl_entry *oe = dentry->d_fsdata;
509+ struct dentry *realdentry;
510+
511+ realdentry = ovl_upperdentry_dereference(oe);
512+ if (!realdentry)
513+ realdentry = oe->lowerdentry;
514+
515+ return realdentry;
516+}
517+
518+static bool ovl_dentry_is_opaque(struct dentry *dentry)
519+{
520+ struct ovl_entry *oe = dentry->d_fsdata;
521+ return oe->opaque;
522+}
523+
524+static void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
525+{
526+ struct ovl_entry *oe = dentry->d_fsdata;
527+ oe->opaque = opaque;
528+}
529+
530+static void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
531+{
532+ struct ovl_entry *oe = dentry->d_fsdata;
533+
534+ WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
535+ WARN_ON(oe->__upperdentry);
536+ smp_wmb();
537+ oe->__upperdentry = upperdentry;
538+}
539+
540+static void ovl_dentry_version_inc(struct dentry *dentry)
541+{
542+ struct ovl_entry *oe = dentry->d_fsdata;
543+
544+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
545+ oe->version++;
546+}
547+
548+static u64 ovl_dentry_version_get(struct dentry *dentry)
549+{
550+ struct ovl_entry *oe = dentry->d_fsdata;
551+
552+ WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
553+ return oe->version;
554+}
555+
556+static bool ovl_is_whiteout(struct dentry *dentry)
557+{
558+ int res;
559+ char val;
560+
561+ if (!dentry)
562+ return false;
563+ if (!dentry->d_inode)
564+ return false;
565+ if (!S_ISLNK(dentry->d_inode->i_mode))
566+ return false;
567+
568+ res = vfs_getxattr(dentry, ovl_whiteout_xattr, &val, 1);
569+ if (res == 1 && val == 'y')
570+ return true;
571+
572+ return false;
573+}
574+
575+static bool ovl_is_opaquedir(struct dentry *dentry)
576+{
577+ int res;
578+ char val;
579+
580+ if (!S_ISDIR(dentry->d_inode->i_mode))
581+ return false;
582+
583+ res = vfs_getxattr(dentry, ovl_opaque_xattr, &val, 1);
584+ if (res == 1 && val == 'y')
585+ return true;
586+
587+ return false;
588+}
589+
590+struct ovl_cache_entry {
591+ const char *name;
592+ unsigned int len;
593+ unsigned int type;
594+ u64 ino;
595+ bool is_whiteout;
596+ struct list_head l_node;
597+ struct rb_node node;
598+};
599+
600+struct ovl_readdir_data {
601+ struct rb_root *root;
602+ struct list_head *list;
603+ struct list_head *middle;
604+ struct dentry *dir;
605+ int count;
606+ int err;
607+};
608+
609+struct ovl_dir_file {
610+ bool is_real;
611+ bool is_cached;
612+ struct list_head cursor;
613+ u64 cache_version;
614+ struct list_head cache;
615+ struct file *realfile;
616+};
617+
618+static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
619+{
620+ return container_of(n, struct ovl_cache_entry, node);
621+}
622+
623+static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
624+ const char *name, int len)
625+{
626+ struct rb_node *node = root->rb_node;
627+ int cmp;
628+
629+ while (node) {
630+ struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
631+
632+ cmp = strncmp(name, p->name, len);
633+ if (cmp > 0)
634+ node = p->node.rb_right;
635+ else if (cmp < 0 || len < p->len)
636+ node = p->node.rb_left;
637+ else
638+ return p;
639+ }
640+
641+ return NULL;
642+}
643+
644+static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
645+ u64 ino, unsigned int d_type,
646+ bool is_whiteout)
647+{
648+ struct ovl_cache_entry *p;
649+
650+ p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL);
651+ if (p) {
652+ char *name_copy = (char *) (p + 1);
653+ memcpy(name_copy, name, len);
654+ name_copy[len] = '\0';
655+ p->name = name_copy;
656+ p->len = len;
657+ p->type = d_type;
658+ p->ino = ino;
659+ p->is_whiteout = is_whiteout;
660+ }
661+
662+ return p;
663+}
664+
665+static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
666+ const char *name, int len, u64 ino,
667+ unsigned int d_type, bool is_whiteout)
668+{
669+ struct rb_node **newp = &rdd->root->rb_node;
670+ struct rb_node *parent = NULL;
671+ struct ovl_cache_entry *p;
672+
673+ while (*newp) {
674+ int cmp;
675+ struct ovl_cache_entry *tmp;
676+
677+ parent = *newp;
678+ tmp = ovl_cache_entry_from_node(*newp);
679+ cmp = strncmp(name, tmp->name, len);
680+ if (cmp > 0)
681+ newp = &tmp->node.rb_right;
682+ else if (cmp < 0 || len < tmp->len)
683+ newp = &tmp->node.rb_left;
684+ else
685+ return 0;
686+ }
687+
688+ p = ovl_cache_entry_new(name, len, ino, d_type, is_whiteout);
689+ if (p == NULL)
690+ return -ENOMEM;
691+
692+ list_add_tail(&p->l_node, rdd->list);
693+ rb_link_node(&p->node, parent, newp);
694+ rb_insert_color(&p->node, rdd->root);
695+
696+ return 0;
697+}
698+
699+static int ovl_fill_lower(void *buf, const char *name, int namelen,
700+ loff_t offset, u64 ino, unsigned int d_type)
701+{
702+ struct ovl_readdir_data *rdd = buf;
703+ struct ovl_cache_entry *p;
704+
705+ rdd->count++;
706+ p = ovl_cache_entry_find(rdd->root, name, namelen);
707+ if (p) {
708+ list_move_tail(&p->l_node, rdd->middle);
709+ } else {
710+ p = ovl_cache_entry_new(name, namelen, ino, d_type, false);
711+ if (p == NULL)
712+ rdd->err = -ENOMEM;
713+ else
714+ list_add_tail(&p->l_node, rdd->middle);
715+ }
716+
717+ return rdd->err;
718+}
719+
720+static void ovl_cache_free(struct list_head *list)
721+{
722+ struct ovl_cache_entry *p;
723+ struct ovl_cache_entry *n;
724+
725+ list_for_each_entry_safe(p, n, list, l_node)
726+ kfree(p);
727+
728+ INIT_LIST_HEAD(list);
729+}
730+
731+static int ovl_fill_upper(void *buf, const char *name, int namelen,
732+ loff_t offset, u64 ino, unsigned int d_type)
733+{
734+ struct ovl_readdir_data *rdd = buf;
735+ bool is_whiteout = false;
736+
737+ rdd->count++;
738+ if (d_type == DT_LNK) {
739+ struct dentry *dentry;
740+
741+ dentry = lookup_one_len(name, rdd->dir, namelen);
742+ if (IS_ERR(dentry)) {
743+ rdd->err = PTR_ERR(dentry);
744+ goto out;
745+ }
746+ is_whiteout = ovl_is_whiteout(dentry);
747+ dput(dentry);
748+ }
749+
750+ rdd->err = ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type,
751+ is_whiteout);
752+
753+out:
754+ return rdd->err;
755+}
756+
757+static int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd,
758+ filldir_t filler)
759+{
760+ const struct cred *old_cred;
761+ struct cred *override_cred;
762+ struct file *realfile;
763+ int err;
764+
765+ realfile = vfs_open(realpath, O_RDONLY | O_DIRECTORY, current_cred());
766+ if (IS_ERR(realfile))
767+ return PTR_ERR(realfile);
768+
769+ err = -ENOMEM;
770+ override_cred = prepare_creds();
771+ if (override_cred) {
772+ /*
773+ * CAP_SYS_ADMIN for getxattr
774+ * CAP_DAC_OVERRIDE for lookup and unlink
775+ */
776+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
777+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
778+ old_cred = override_creds(override_cred);
779+
780+ do {
781+ rdd->count = 0;
782+ rdd->err = 0;
783+ err = vfs_readdir(realfile, filler, rdd);
784+ if (err >= 0)
785+ err = rdd->err;
786+ } while (!err && rdd->count);
787+
788+ revert_creds(old_cred);
789+ put_cred(override_cred);
790+ }
791+ fput(realfile);
792+
793+ if (err) {
794+ if (rdd->list)
795+ ovl_cache_free(rdd->list);
796+ return err;
797+ }
798+
799+ return 0;
800+}
801+
802+static void ovl_dir_reset(struct file *file)
803+{
804+ struct ovl_dir_file *od = file->private_data;
805+ enum ovl_path_type type = ovl_path_type(file->f_path.dentry);
806+
807+ if (ovl_dentry_version_get(file->f_path.dentry) != od->cache_version) {
808+ list_del_init(&od->cursor);
809+ ovl_cache_free(&od->cache);
810+ od->is_cached = false;
811+ }
812+ WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
813+ if (od->is_real && type == OVL_PATH_MERGE) {
814+ fput(od->realfile);
815+ od->realfile = NULL;
816+ od->is_real = false;
817+ }
818+}
819+
820+static int ovl_dir_read_merged(struct path *upperpath, struct path *lowerpath,
821+ struct ovl_readdir_data *rdd)
822+{
823+ int err;
824+ struct rb_root root = RB_ROOT;
825+ struct list_head middle;
826+
827+ rdd->root = &root;
828+ if (upperpath->dentry) {
829+ rdd->dir = upperpath->dentry;
830+ err = ovl_dir_read(upperpath, rdd, ovl_fill_upper);
831+ if (err)
832+ goto out;
833+ }
834+ /*
835+ * Insert lowerpath entries before upperpath ones, this allows
836+ * offsets to be reasonably constant
837+ */
838+ list_add(&middle, rdd->list);
839+ rdd->middle = &middle;
840+ err = ovl_dir_read(lowerpath, rdd, ovl_fill_lower);
841+ list_del(&middle);
842+out:
843+ rdd->root = NULL;
844+
845+ return err;
846+}
847+
848+static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
849+{
850+ struct list_head *l;
851+ loff_t off;
852+
853+ l = od->cache.next;
854+ for (off = 0; off < pos; off++) {
855+ if (l == &od->cache)
856+ break;
857+ l = l->next;
858+ }
859+ list_move_tail(&od->cursor, l);
860+}
861+
862+static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
863+{
864+ struct ovl_dir_file *od = file->private_data;
865+ int res;
866+
867+ if (!file->f_pos)
868+ ovl_dir_reset(file);
869+
870+ if (od->is_real) {
871+ res = vfs_readdir(od->realfile, filler, buf);
872+ file->f_pos = od->realfile->f_pos;
873+
874+ return res;
875+ }
876+
877+ if (!od->is_cached) {
878+ struct path lowerpath;
879+ struct path upperpath;
880+ struct ovl_readdir_data rdd = { .list = &od->cache };
881+
882+ ovl_path_lower(file->f_path.dentry, &lowerpath);
883+ ovl_path_upper(file->f_path.dentry, &upperpath);
884+
885+ res = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
886+ if (res)
887+ return res;
888+
889+ od->cache_version = ovl_dentry_version_get(file->f_path.dentry);
890+ od->is_cached = true;
891+
892+ ovl_seek_cursor(od, file->f_pos);
893+ }
894+
895+ while (od->cursor.next != &od->cache) {
896+ int over;
897+ loff_t off;
898+ struct ovl_cache_entry *p;
899+
900+ p = list_entry(od->cursor.next, struct ovl_cache_entry, l_node);
901+ off = file->f_pos;
902+ file->f_pos++;
903+ list_move(&od->cursor, &p->l_node);
904+
905+ if (p->is_whiteout)
906+ continue;
907+
908+ over = filler(buf, p->name, p->len, off, p->ino, p->type);
909+ if (over)
910+ break;
911+ }
912+
913+ return 0;
914+}
915+
916+static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
917+{
918+ loff_t res;
919+ struct ovl_dir_file *od = file->private_data;
920+
921+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
922+ if (!file->f_pos)
923+ ovl_dir_reset(file);
924+
925+ if (od->is_real) {
926+ res = vfs_llseek(od->realfile, offset, origin);
927+ file->f_pos = od->realfile->f_pos;
928+ } else {
929+ res = -EINVAL;
930+
931+ switch (origin) {
932+ case SEEK_CUR:
933+ offset += file->f_pos;
934+ break;
935+ case SEEK_SET:
936+ break;
937+ default:
938+ goto out_unlock;
939+ }
940+ if (offset < 0)
941+ goto out_unlock;
942+
943+ if (offset != file->f_pos) {
944+ file->f_pos = offset;
945+ if (od->is_cached)
946+ ovl_seek_cursor(od, offset);
947+ }
948+ res = offset;
949+ }
950+out_unlock:
951+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
952+
953+ return res;
954+}
955+
956+static int ovl_dir_fsync(struct file *file, int datasync)
957+{
958+ struct ovl_dir_file *od = file->private_data;
959+
960+ /* May need to reopen directory if it got copied up */
961+ if (!od->realfile) {
962+ struct path upperpath;
963+
964+ ovl_path_upper(file->f_path.dentry, &upperpath);
965+ od->realfile = vfs_open(&upperpath, O_RDONLY, current_cred());
966+ if (IS_ERR(od->realfile))
967+ return PTR_ERR(od->realfile);
968+ }
969+
970+ return vfs_fsync(od->realfile, datasync);
971+}
972+
973+static int ovl_dir_release(struct inode *inode, struct file *file)
974+{
975+ struct ovl_dir_file *od = file->private_data;
976+
977+ list_del(&od->cursor);
978+ ovl_cache_free(&od->cache);
979+ if (od->realfile)
980+ fput(od->realfile);
981+ kfree(od);
982+
983+ return 0;
984+}
985+
986+static int ovl_dir_open(struct inode *inode, struct file *file)
987+{
988+ struct path realpath;
989+ struct file *realfile;
990+ struct ovl_dir_file *od;
991+ enum ovl_path_type type;
992+
993+ od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
994+ if (!od)
995+ return -ENOMEM;
996+
997+ type = ovl_path_real(file->f_path.dentry, &realpath);
998+ realfile = vfs_open(&realpath, file->f_flags, current_cred());
999+ if (IS_ERR(realfile)) {
1000+ kfree(od);
1001+ return PTR_ERR(realfile);
1002+ }
1003+ INIT_LIST_HEAD(&od->cache);
1004+ INIT_LIST_HEAD(&od->cursor);
1005+ od->is_cached = false;
1006+ od->realfile = realfile;
1007+ od->is_real = (type != OVL_PATH_MERGE);
1008+ file->private_data = od;
1009+
1010+ return 0;
1011+}
1012+
1013+static const struct file_operations ovl_dir_operations = {
1014+ .read = generic_read_dir,
1015+ .open = ovl_dir_open,
1016+ .readdir = ovl_readdir,
1017+ .llseek = ovl_dir_llseek,
1018+ .fsync = ovl_dir_fsync,
1019+ .release = ovl_dir_release,
1020+};
1021+
1022+static const struct inode_operations ovl_dir_inode_operations;
1023+
1024+static void ovl_entry_free(struct rcu_head *head)
1025+{
1026+ struct ovl_entry *oe = container_of(head, struct ovl_entry, rcu);
1027+ kfree(oe);
1028+}
1029+
1030+static void ovl_dentry_release(struct dentry *dentry)
1031+{
1032+ struct ovl_entry *oe = dentry->d_fsdata;
1033+
1034+ if (oe) {
1035+ dput(oe->__upperdentry);
1036+ dput(oe->lowerdentry);
1037+ call_rcu(&oe->rcu, ovl_entry_free);
1038+ }
1039+}
1040+
1041+static const struct dentry_operations ovl_dentry_operations = {
1042+ .d_release = ovl_dentry_release,
1043+};
1044+
1045+static struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name)
1046+{
1047+ struct dentry *dentry;
1048+
1049+ mutex_lock(&dir->d_inode->i_mutex);
1050+ dentry = lookup_one_len(name->name, dir, name->len);
1051+ mutex_unlock(&dir->d_inode->i_mutex);
1052+
1053+ if (IS_ERR(dentry)) {
1054+ if (PTR_ERR(dentry) == -ENOENT)
1055+ dentry = NULL;
1056+ } else if (!dentry->d_inode) {
1057+ dput(dentry);
1058+ dentry = NULL;
1059+ }
1060+ return dentry;
1061+}
1062+
1063+static struct ovl_entry *ovl_alloc_entry(void)
1064+{
1065+ return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
1066+}
1067+
1068+static struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
1069+ struct ovl_entry *oe);
1070+
1071+static struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
1072+ struct nameidata *nd)
1073+{
1074+ struct ovl_entry *oe;
1075+ struct dentry *upperdir;
1076+ struct dentry *lowerdir;
1077+ struct dentry *upperdentry = NULL;
1078+ struct dentry *lowerdentry = NULL;
1079+ struct inode *inode = NULL;
1080+ int err;
1081+
1082+ err = -ENOMEM;
1083+ oe = ovl_alloc_entry();
1084+ if (!oe)
1085+ goto out;
1086+
1087+ upperdir = ovl_dentry_upper(dentry->d_parent);
1088+ lowerdir = ovl_dentry_lower(dentry->d_parent);
1089+
1090+ if (upperdir) {
1091+ upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
1092+ err = PTR_ERR(upperdentry);
1093+ if (IS_ERR(upperdentry))
1094+ goto out_put_dir;
1095+
1096+ if (lowerdir && upperdentry &&
1097+ (S_ISLNK(upperdentry->d_inode->i_mode) ||
1098+ S_ISDIR(upperdentry->d_inode->i_mode))) {
1099+ const struct cred *old_cred;
1100+ struct cred *override_cred;
1101+
1102+ err = -ENOMEM;
1103+ override_cred = prepare_creds();
1104+ if (!override_cred)
1105+ goto out_dput_upper;
1106+
1107+ /* CAP_SYS_ADMIN needed for getxattr */
1108+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1109+ old_cred = override_creds(override_cred);
1110+
1111+ if (ovl_is_opaquedir(upperdentry)) {
1112+ oe->opaque = true;
1113+ } else if (ovl_is_whiteout(upperdentry)) {
1114+ dput(upperdentry);
1115+ upperdentry = NULL;
1116+ oe->opaque = true;
1117+ }
1118+ revert_creds(old_cred);
1119+ put_cred(override_cred);
1120+ }
1121+ }
1122+ if (lowerdir && !oe->opaque) {
1123+ lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
1124+ err = PTR_ERR(lowerdentry);
1125+ if (IS_ERR(lowerdentry))
1126+ goto out_dput_upper;
1127+ }
1128+
1129+ if (lowerdentry && upperdentry &&
1130+ (!S_ISDIR(upperdentry->d_inode->i_mode) ||
1131+ !S_ISDIR(lowerdentry->d_inode->i_mode))) {
1132+ dput(lowerdentry);
1133+ lowerdentry = NULL;
1134+ oe->opaque = true;
1135+ }
1136+
1137+ if (lowerdentry || upperdentry) {
1138+ struct dentry *realdentry;
1139+
1140+ realdentry = upperdentry ? upperdentry : lowerdentry;
1141+ err = -ENOMEM;
1142+ inode = ovl_new_inode(dir->i_sb, realdentry->d_inode->i_mode, oe);
1143+ if (!inode)
1144+ goto out_dput;
1145+ }
1146+
1147+ if (upperdentry)
1148+ oe->__upperdentry = upperdentry;
1149+
1150+ if (lowerdentry)
1151+ oe->lowerdentry = lowerdentry;
1152+
1153+ dentry->d_fsdata = oe;
1154+ dentry->d_op = &ovl_dentry_operations;
1155+ d_add(dentry, inode);
1156+
1157+ return NULL;
1158+
1159+out_dput:
1160+ dput(lowerdentry);
1161+out_dput_upper:
1162+ dput(upperdentry);
1163+out_put_dir:
1164+ kfree(oe);
1165+out:
1166+ return ERR_PTR(err);
1167+}
1168+
1169+static int ovl_copy_up_xattr(struct dentry *old, struct dentry *new)
1170+{
1171+ ssize_t list_size, size;
1172+ char *buf, *name, *value;
1173+ int error;
1174+
1175+ if (!old->d_inode->i_op->getxattr ||
1176+ !new->d_inode->i_op->getxattr)
1177+ return 0;
1178+
1179+ list_size = vfs_listxattr(old, NULL, 0);
1180+ if (list_size <= 0) {
1181+ if (list_size == -EOPNOTSUPP)
1182+ return 0;
1183+ return list_size;
1184+ }
1185+
1186+ buf = kzalloc(list_size, GFP_KERNEL);
1187+ if (!buf)
1188+ return -ENOMEM;
1189+
1190+ error = -ENOMEM;
1191+ value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
1192+ if (!value)
1193+ goto out;
1194+
1195+ list_size = vfs_listxattr(old, buf, list_size);
1196+ if (list_size <= 0) {
1197+ error = list_size;
1198+ goto out_free_value;
1199+ }
1200+
1201+ for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
1202+ size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
1203+ if (size <= 0) {
1204+ error = size;
1205+ goto out_free_value;
1206+ }
1207+ error = vfs_setxattr(new, name, value, size, 0);
1208+ if (error)
1209+ goto out_free_value;
1210+ }
1211+
1212+out_free_value:
1213+ kfree(value);
1214+out:
1215+ kfree(buf);
1216+ return error;
1217+}
1218+
1219+static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
1220+{
1221+ struct file *old_file;
1222+ struct file *new_file;
1223+ int error = 0;
1224+
1225+ if (len == 0)
1226+ return 0;
1227+
1228+ old_file = vfs_open(old, O_RDONLY, current_cred());
1229+ if (IS_ERR(old_file))
1230+ return PTR_ERR(old_file);
1231+
1232+ new_file = vfs_open(new, O_WRONLY, current_cred());
1233+ if (IS_ERR(new_file)) {
1234+ error = PTR_ERR(new_file);
1235+ goto out_fput;
1236+ }
1237+
1238+ /* FIXME: copy up sparse files efficiently */
1239+ while (len) {
1240+ loff_t offset = new_file->f_pos;
1241+ size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
1242+ long bytes;
1243+
1244+ if (len < this_len)
1245+ this_len = len;
1246+
1247+ if (signal_pending_state(TASK_KILLABLE, current))
1248+ return -EINTR;
1249+
1250+ bytes = do_splice_direct(old_file, &offset, new_file, this_len,
1251+ SPLICE_F_MOVE);
1252+ if (bytes <= 0) {
1253+ error = bytes;
1254+ break;
1255+ }
1256+
1257+ len -= bytes;
1258+ }
1259+
1260+ fput(new_file);
1261+out_fput:
1262+ fput(old_file);
1263+ return error;
1264+}
1265+
1266+static struct dentry *ovl_lookup_create(struct dentry *upperdir,
1267+ struct dentry *template)
1268+{
1269+ int err;
1270+ struct dentry *newdentry;
1271+ struct qstr *name = &template->d_name;
1272+
1273+ newdentry = lookup_one_len(name->name, upperdir, name->len);
1274+ if (IS_ERR(newdentry))
1275+ return newdentry;
1276+
1277+ if (newdentry->d_inode) {
1278+ const struct cred *old_cred;
1279+ struct cred *override_cred;
1280+
1281+ /* No need to check whiteout if lower parent is non-existent */
1282+ err = -EEXIST;
1283+ if (!ovl_dentry_lower(template->d_parent))
1284+ goto out_dput;
1285+
1286+ if (!S_ISLNK(newdentry->d_inode->i_mode))
1287+ goto out_dput;
1288+
1289+ err = -ENOMEM;
1290+ override_cred = prepare_creds();
1291+ if (!override_cred)
1292+ goto out_dput;
1293+
1294+ /*
1295+ * CAP_SYS_ADMIN for getxattr
1296+ * CAP_FOWNER for unlink in sticky directory
1297+ */
1298+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1299+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
1300+ old_cred = override_creds(override_cred);
1301+
1302+ err = -EEXIST;
1303+ if (ovl_is_whiteout(newdentry))
1304+ err = vfs_unlink(upperdir->d_inode, newdentry);
1305+
1306+ revert_creds(old_cred);
1307+ put_cred(override_cred);
1308+ if (err)
1309+ goto out_dput;
1310+
1311+ dput(newdentry);
1312+ newdentry = lookup_one_len(name->name, upperdir, name->len);
1313+ if (IS_ERR(newdentry))
1314+ return newdentry;
1315+
1316+ /*
1317+ * Whiteout just been successfully removed, parent
1318+ * i_mutex is still held, there's no way the lookup
1319+ * could return positive.
1320+ */
1321+ WARN_ON(newdentry->d_inode);
1322+ }
1323+
1324+ return newdentry;
1325+
1326+out_dput:
1327+ dput(newdentry);
1328+ return ERR_PTR(err);
1329+}
1330+
1331+static struct dentry *ovl_upper_create(struct dentry *upperdir,
1332+ struct dentry *dentry,
1333+ struct kstat *stat, const char *link)
1334+{
1335+ int err;
1336+ struct dentry *newdentry;
1337+ struct inode *dir = upperdir->d_inode;
1338+
1339+ newdentry = ovl_lookup_create(upperdir, dentry);
1340+ if (IS_ERR(newdentry))
1341+ goto out;
1342+
1343+ switch (stat->mode & S_IFMT) {
1344+ case S_IFREG:
1345+ err = vfs_create(dir, newdentry, stat->mode, NULL);
1346+ break;
1347+
1348+ case S_IFDIR:
1349+ err = vfs_mkdir(dir, newdentry, stat->mode);
1350+ break;
1351+
1352+ case S_IFCHR:
1353+ case S_IFBLK:
1354+ case S_IFIFO:
1355+ case S_IFSOCK:
1356+ err = vfs_mknod(dir, newdentry, stat->mode, stat->rdev);
1357+ break;
1358+
1359+ case S_IFLNK:
1360+ err = vfs_symlink(dir, newdentry, link);
1361+ break;
1362+
1363+ default:
1364+ err = -EPERM;
1365+ }
1366+ if (err) {
1367+ dput(newdentry);
1368+ newdentry = ERR_PTR(err);
1369+ }
1370+
1371+out:
1372+ return newdentry;
1373+
1374+}
1375+
1376+static char *ovl_read_symlink(struct dentry *realdentry)
1377+{
1378+ int res;
1379+ char *buf;
1380+ struct inode *inode = realdentry->d_inode;
1381+ mm_segment_t old_fs;
1382+
1383+ res = -EINVAL;
1384+ if (!inode->i_op->readlink)
1385+ goto err;
1386+
1387+ res = -ENOMEM;
1388+ buf = (char *) __get_free_page(GFP_KERNEL);
1389+ if (!buf)
1390+ goto err;
1391+
1392+ old_fs = get_fs();
1393+ set_fs(get_ds());
1394+ /* The cast to a user pointer is valid due to the set_fs() */
1395+ res = inode->i_op->readlink(realdentry,
1396+ (char __user *)buf, PAGE_SIZE - 1);
1397+ set_fs(old_fs);
1398+ if (res < 0) {
1399+ free_page((unsigned long) buf);
1400+ goto err;
1401+ }
1402+ buf[res] = '\0';
1403+
1404+ return buf;
1405+
1406+err:
1407+ return ERR_PTR(res);
1408+}
1409+
1410+static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
1411+{
1412+ struct iattr attr = {
1413+ .ia_valid = ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
1414+ .ia_atime = stat->atime,
1415+ .ia_mtime = stat->mtime,
1416+ };
1417+
1418+ return notify_change(upperdentry, &attr);
1419+}
1420+
1421+static int ovl_set_mode(struct dentry *upperdentry, umode_t mode)
1422+{
1423+ struct iattr attr = {
1424+ .ia_valid = ATTR_MODE,
1425+ .ia_mode = mode,
1426+ };
1427+
1428+ return notify_change(upperdentry, &attr);
1429+}
1430+
1431+static int ovl_set_opaque(struct dentry *upperdentry)
1432+{
1433+ int err;
1434+ const struct cred *old_cred;
1435+ struct cred *override_cred;
1436+
1437+ override_cred = prepare_creds();
1438+ if (!override_cred)
1439+ return -ENOMEM;
1440+
1441+ /* CAP_SYS_ADMIN for setxattr of "trusted" namespace */
1442+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1443+ old_cred = override_creds(override_cred);
1444+ err = vfs_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
1445+ revert_creds(old_cred);
1446+ put_cred(override_cred);
1447+
1448+ return err;
1449+}
1450+
1451+static int ovl_remove_opaque(struct dentry *upperdentry)
1452+{
1453+ int err;
1454+ const struct cred *old_cred;
1455+ struct cred *override_cred;
1456+
1457+ override_cred = prepare_creds();
1458+ if (!override_cred)
1459+ return -ENOMEM;
1460+
1461+ /* CAP_SYS_ADMIN for removexattr of "trusted" namespace */
1462+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1463+ old_cred = override_creds(override_cred);
1464+ err = vfs_removexattr(upperdentry, ovl_opaque_xattr);
1465+ revert_creds(old_cred);
1466+ put_cred(override_cred);
1467+
1468+ return err;
1469+}
1470+
1471+static int ovl_copy_up_locked(struct dentry *upperdir, struct dentry *dentry,
1472+ struct path *lowerpath, struct kstat *stat,
1473+ const char *link)
1474+{
1475+ int err;
1476+ struct path newpath;
1477+ umode_t mode = stat->mode;
1478+ struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
1479+
1480+ /* Can't properly set mode on creation because of the umask */
1481+ stat->mode &= S_IFMT;
1482+
1483+ newpath.mnt = ofs->upper_mnt;
1484+ newpath.dentry = ovl_upper_create(upperdir, dentry, stat, link);
1485+ if (IS_ERR(newpath.dentry)) {
1486+ err = PTR_ERR(newpath.dentry);
1487+
1488+ /* Already copied up? */
1489+ if (err == -EEXIST && ovl_path_type(dentry) != OVL_PATH_LOWER)
1490+ return 0;
1491+
1492+ return err;
1493+ }
1494+
1495+ /* FIXME: recovery from failure to copy up */
1496+
1497+ if (S_ISREG(stat->mode)) {
1498+ err = ovl_copy_up_data(lowerpath, &newpath, stat->size);
1499+ if (err)
1500+ return err;
1501+ }
1502+
1503+ err = ovl_copy_up_xattr(lowerpath->dentry, newpath.dentry);
1504+ if (err)
1505+ return err;
1506+
1507+ mutex_lock(&newpath.dentry->d_inode->i_mutex);
1508+ if (!S_ISLNK(stat->mode))
1509+ err = ovl_set_mode(newpath.dentry, mode);
1510+ if (!err)
1511+ err = ovl_set_timestamps(newpath.dentry, stat);
1512+ mutex_unlock(&newpath.dentry->d_inode->i_mutex);
1513+ if (err)
1514+ return err;
1515+
1516+ ovl_dentry_update(dentry, newpath.dentry);
1517+
1518+ /*
1519+ * Easiest way to get rid of the lower dentry reference is to
1520+ * drop this dentry. This is neither needed nor possible for
1521+ * directories.
1522+ */
1523+ if (!S_ISDIR(stat->mode))
1524+ d_drop(dentry);
1525+
1526+ return 0;
1527+}
1528+
1529+static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
1530+ struct path *lowerpath, struct kstat *stat)
1531+{
1532+ int err;
1533+ struct kstat pstat;
1534+ struct path parentpath;
1535+ struct dentry *upperdir;
1536+ const struct cred *old_cred;
1537+ struct cred *override_cred;
1538+ char *link = NULL;
1539+
1540+ ovl_path_upper(parent, &parentpath);
1541+ upperdir = parentpath.dentry;
1542+
1543+ err = vfs_getattr(parentpath.mnt, parentpath.dentry, &pstat);
1544+ if (err)
1545+ return err;
1546+
1547+ if (S_ISLNK(stat->mode)) {
1548+ link = ovl_read_symlink(lowerpath->dentry);
1549+ if (IS_ERR(link))
1550+ return PTR_ERR(link);
1551+ }
1552+
1553+ err = -ENOMEM;
1554+ override_cred = prepare_creds();
1555+ if (!override_cred)
1556+ goto out_free_link;
1557+
1558+ override_cred->fsuid = stat->uid;
1559+ override_cred->fsgid = stat->gid;
1560+ /*
1561+ * CAP_SYS_ADMIN for copying up extended attributes
1562+ * CAP_DAC_OVERRIDE for create
1563+ * CAP_FOWNER for chmod, timestamp update
1564+ * CAP_FSETID for chmod
1565+ * CAP_MKNOD for mknod
1566+ */
1567+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1568+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
1569+ cap_raise(override_cred->cap_effective, CAP_FOWNER);
1570+ cap_raise(override_cred->cap_effective, CAP_FSETID);
1571+ cap_raise(override_cred->cap_effective, CAP_MKNOD);
1572+ old_cred = override_creds(override_cred);
1573+
1574+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1575+ /*
1576+ * Using upper filesystem locking to protect against copy up
1577+ * racing with rename (rename means the copy up was already
1578+ * successful).
1579+ */
1580+ if (dentry->d_parent != parent) {
1581+ WARN_ON((ovl_path_type(dentry) == OVL_PATH_LOWER));
1582+ err = 0;
1583+ } else {
1584+ err = ovl_copy_up_locked(upperdir, dentry, lowerpath,
1585+ stat, link);
1586+ if (!err) {
1587+ /* Restore timestamps on parent (best effort) */
1588+ ovl_set_timestamps(upperdir, &pstat);
1589+ }
1590+ }
1591+
1592+ mutex_unlock(&upperdir->d_inode->i_mutex);
1593+
1594+ revert_creds(old_cred);
1595+ put_cred(override_cred);
1596+
1597+out_free_link:
1598+ if (link)
1599+ free_page((unsigned long) link);
1600+
1601+ return err;
1602+}
1603+
1604+static int ovl_copy_up(struct dentry *dentry)
1605+{
1606+ int err;
1607+
1608+ err = 0;
1609+ while (!err) {
1610+ struct dentry *next;
1611+ struct dentry *parent;
1612+ struct path lowerpath;
1613+ struct kstat stat;
1614+ enum ovl_path_type type = ovl_path_type(dentry);
1615+
1616+ if (type != OVL_PATH_LOWER)
1617+ break;
1618+
1619+ next = dget(dentry);
1620+ /* find the topmost dentry not yet copied up */
1621+ for (;;) {
1622+ parent = dget_parent(next);
1623+
1624+ type = ovl_path_type(parent);
1625+ if (type != OVL_PATH_LOWER)
1626+ break;
1627+
1628+ dput(next);
1629+ next = parent;
1630+ }
1631+
1632+ ovl_path_lower(next, &lowerpath);
1633+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
1634+ if (!err)
1635+ err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
1636+
1637+ dput(parent);
1638+ dput(next);
1639+ }
1640+
1641+ return err;
1642+}
1643+
1644+/* Optimize by not copying up the file first and truncating later */
1645+static int ovl_copy_up_truncate(struct dentry *dentry, loff_t size)
1646+{
1647+ int err;
1648+ struct kstat stat;
1649+ struct path lowerpath;
1650+ struct dentry *parent = dget_parent(dentry);
1651+
1652+ err = ovl_copy_up(parent);
1653+ if (err)
1654+ goto out_dput_parent;
1655+
1656+ ovl_path_lower(dentry, &lowerpath);
1657+ err = vfs_getattr(lowerpath.mnt, lowerpath.dentry, &stat);
1658+ if (err)
1659+ goto out_dput_parent;
1660+
1661+ if (size < stat.size)
1662+ stat.size = size;
1663+
1664+ err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
1665+
1666+out_dput_parent:
1667+ dput(parent);
1668+ return err;
1669+}
1670+
1671+static int ovl_setattr(struct dentry *dentry, struct iattr *attr)
1672+{
1673+ struct dentry *upperdentry;
1674+ int err;
1675+
1676+ if ((attr->ia_valid & ATTR_SIZE) && !ovl_dentry_upper(dentry))
1677+ err = ovl_copy_up_truncate(dentry, attr->ia_size);
1678+ else
1679+ err = ovl_copy_up(dentry);
1680+ if (err)
1681+ return err;
1682+
1683+ upperdentry = ovl_dentry_upper(dentry);
1684+
1685+ if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
1686+ attr->ia_valid &= ~ATTR_MODE;
1687+
1688+ mutex_lock(&upperdentry->d_inode->i_mutex);
1689+ err = notify_change(upperdentry, attr);
1690+ mutex_unlock(&upperdentry->d_inode->i_mutex);
1691+
1692+ return err;
1693+}
1694+
1695+static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
1696+ struct kstat *stat)
1697+{
1698+ struct path realpath;
1699+
1700+ ovl_path_real(dentry, &realpath);
1701+ return vfs_getattr(realpath.mnt, realpath.dentry, stat);
1702+}
1703+
1704+static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
1705+ struct kstat *stat)
1706+{
1707+ int err;
1708+ enum ovl_path_type type;
1709+ struct path realpath;
1710+
1711+ type = ovl_path_real(dentry, &realpath);
1712+ err = vfs_getattr(realpath.mnt, realpath.dentry, stat);
1713+ if (err)
1714+ return err;
1715+
1716+ stat->dev = dentry->d_sb->s_dev;
1717+ stat->ino = dentry->d_inode->i_ino;
1718+
1719+ /*
1720+ * It's probably not worth it to count subdirs to get the
1721+ * correct link count. nlink=1 seems to pacify 'find' and
1722+ * other utilities.
1723+ */
1724+ if (type == OVL_PATH_MERGE)
1725+ stat->nlink = 1;
1726+
1727+ return 0;
1728+}
1729+
1730+static int ovl_permission(struct inode *inode, int mask, unsigned int flags)
1731+{
1732+ struct ovl_entry *oe;
1733+ struct dentry *alias = NULL;
1734+ struct inode *realinode;
1735+ struct dentry *realdentry;
1736+ bool is_upper;
1737+ int err;
1738+
1739+ if (S_ISDIR(inode->i_mode)) {
1740+ oe = inode->i_private;
1741+ } else if (flags & IPERM_FLAG_RCU) {
1742+ return -ECHILD;
1743+ } else {
1744+ /*
1745+ * For non-directories find an alias and get the info
1746+ * from there.
1747+ */
1748+ spin_lock(&inode->i_lock);
1749+ if (WARN_ON(list_empty(&inode->i_dentry))) {
1750+ spin_unlock(&inode->i_lock);
1751+ return -ENOENT;
1752+ }
1753+ alias = list_entry(inode->i_dentry.next, struct dentry, d_alias);
1754+ dget(alias);
1755+ spin_unlock(&inode->i_lock);
1756+ oe = alias->d_fsdata;
1757+ }
1758+
1759+ realdentry = ovl_upperdentry_dereference(oe);
1760+ is_upper = true;
1761+ if (!realdentry) {
1762+ realdentry = oe->lowerdentry;
1763+ is_upper = false;
1764+ }
1765+
1766+ /* Careful in RCU walk mode */
1767+ realinode = ACCESS_ONCE(realdentry->d_inode);
1768+ if (!realinode) {
1769+ WARN_ON(!(flags & IPERM_FLAG_RCU));
1770+ return -ENOENT;
1771+ }
1772+
1773+ if (mask & MAY_WRITE) {
1774+ umode_t mode = realinode->i_mode;
1775+
1776+ /*
1777+ * Writes will always be redirected to upper layer, so
1778+ * ignore lower layer being read-only.
1779+ */
1780+ err = -EROFS;
1781+ if (is_upper && IS_RDONLY(realinode) &&
1782+ (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
1783+ goto out_dput;
1784+
1785+ /*
1786+ * Nobody gets write access to an immutable file.
1787+ */
1788+ err = -EACCES;
1789+ if (IS_IMMUTABLE(realinode))
1790+ goto out_dput;
1791+ }
1792+
1793+ if (realinode->i_op->permission)
1794+ err = realinode->i_op->permission(realinode, mask, flags);
1795+ else
1796+ err = generic_permission(realinode, mask, flags,
1797+ realinode->i_op->check_acl);
1798+out_dput:
1799+ dput(alias);
1800+ return err;
1801+}
1802+
1803+static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
1804+ const char *link)
1805+{
1806+ int err;
1807+ struct dentry *newdentry;
1808+ struct dentry *upperdir;
1809+ struct inode *inode;
1810+ struct kstat stat = {
1811+ .mode = mode,
1812+ .rdev = rdev,
1813+ };
1814+
1815+ err = -ENOMEM;
1816+ inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
1817+ if (!inode)
1818+ goto out;
1819+
1820+ err = ovl_copy_up(dentry->d_parent);
1821+ if (err)
1822+ goto out_iput;
1823+
1824+ upperdir = ovl_dentry_upper(dentry->d_parent);
1825+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
1826+
1827+ newdentry = ovl_upper_create(upperdir, dentry, &stat, link);
1828+ err = PTR_ERR(newdentry);
1829+ if (IS_ERR(newdentry))
1830+ goto out_unlock;
1831+
1832+ ovl_dentry_version_inc(dentry->d_parent);
1833+ if (ovl_dentry_is_opaque(dentry) && S_ISDIR(mode)) {
1834+ err = ovl_set_opaque(newdentry);
1835+ if (err)
1836+ goto out_dput;
1837+ }
1838+ ovl_dentry_update(dentry, newdentry);
1839+ d_instantiate(dentry, inode);
1840+ inode = NULL;
1841+ newdentry = NULL;
1842+ err = 0;
1843+
1844+out_dput:
1845+ dput(newdentry);
1846+out_unlock:
1847+ mutex_unlock(&upperdir->d_inode->i_mutex);
1848+out_iput:
1849+ iput(inode);
1850+out:
1851+ return err;
1852+}
1853+
1854+static int ovl_create(struct inode *dir, struct dentry *dentry, int mode,
1855+ struct nameidata *nd)
1856+{
1857+ return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
1858+}
1859+
1860+static int ovl_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1861+{
1862+ return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
1863+}
1864+
1865+static int ovl_mknod(struct inode *dir, struct dentry *dentry, int mode,
1866+ dev_t rdev)
1867+{
1868+ return ovl_create_object(dentry, mode, rdev, NULL);
1869+}
1870+
1871+static int ovl_symlink(struct inode *dir, struct dentry *dentry,
1872+ const char *link)
1873+{
1874+ return ovl_create_object(dentry, S_IFLNK, 0, link);
1875+}
1876+
/*
 * Cookie handed from ovl_follow_link() to ovl_put_link() so that the
 * real filesystem's ->put_link can be called with its own arguments.
 */
struct ovl_link_data {
	struct dentry *realdentry;	/* real dentry that was followed */
	void *cookie;			/* value returned by the real ->follow_link */
};
1881+
1882+static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
1883+{
1884+ void *ret;
1885+ struct dentry *realdentry;
1886+ struct inode *realinode;
1887+
1888+ realdentry = ovl_dentry_real(dentry);
1889+ realinode = realdentry->d_inode;
1890+
1891+ if (WARN_ON(!realinode->i_op->follow_link))
1892+ return ERR_PTR(-EPERM);
1893+
1894+ ret = realinode->i_op->follow_link(realdentry, nd);
1895+ if (IS_ERR(ret))
1896+ return ret;
1897+
1898+ if (realinode->i_op->put_link) {
1899+ struct ovl_link_data *data;
1900+
1901+ data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
1902+ if (!data) {
1903+ realinode->i_op->put_link(realdentry, nd, ret);
1904+ return ERR_PTR(-ENOMEM);
1905+ }
1906+ data->realdentry = realdentry;
1907+ data->cookie = ret;
1908+
1909+ return data;
1910+ } else {
1911+ return NULL;
1912+ }
1913+}
1914+
1915+static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
1916+{
1917+ struct inode *realinode;
1918+ struct ovl_link_data *data = c;
1919+
1920+ if (!data)
1921+ return;
1922+
1923+ realinode = data->realdentry->d_inode;
1924+ realinode->i_op->put_link(data->realdentry, nd, data->cookie);
1925+ kfree(data);
1926+}
1927+
1928+static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
1929+{
1930+ struct path realpath;
1931+ struct inode *realinode;
1932+
1933+ ovl_path_real(dentry, &realpath);
1934+ realinode = realpath.dentry->d_inode;
1935+
1936+ if (!realinode->i_op->readlink)
1937+ return -EINVAL;
1938+
1939+ touch_atime(realpath.mnt, realpath.dentry);
1940+
1941+ return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
1942+}
1943+
1944+static int ovl_whiteout(struct dentry *upperdir, struct dentry *dentry)
1945+{
1946+ int err;
1947+ struct dentry *newdentry;
1948+ const struct cred *old_cred;
1949+ struct cred *override_cred;
1950+
1951+ /* FIXME: recheck lower dentry to see if whiteout is really needed */
1952+
1953+ err = -ENOMEM;
1954+ override_cred = prepare_creds();
1955+ if (!override_cred)
1956+ goto out;
1957+
1958+ /*
1959+ * CAP_SYS_ADMIN for setxattr
1960+ * CAP_DAC_OVERRIDE for symlink creation
1961+ */
1962+ cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
1963+ cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
1964+ override_cred->fsuid = 0;
1965+ override_cred->fsgid = 0;
1966+ old_cred = override_creds(override_cred);
1967+
1968+ newdentry = lookup_one_len(dentry->d_name.name, upperdir,
1969+ dentry->d_name.len);
1970+ err = PTR_ERR(newdentry);
1971+ if (IS_ERR(newdentry))
1972+ goto out_put_cred;
1973+
1974+ /* Just been removed within the same locked region */
1975+ WARN_ON(newdentry->d_inode);
1976+
1977+ err = vfs_symlink(upperdir->d_inode, newdentry, ovl_whiteout_symlink);
1978+ if (err)
1979+ goto out_dput;
1980+
1981+ ovl_dentry_version_inc(dentry->d_parent);
1982+
1983+ err = vfs_setxattr(newdentry, ovl_whiteout_xattr, "y", 1, 0);
1984+
1985+out_dput:
1986+ dput(newdentry);
1987+out_put_cred:
1988+ revert_creds(old_cred);
1989+ put_cred(override_cred);
1990+out:
1991+ return err;
1992+}
1993+
1994+static int ovl_do_remove(struct dentry *dentry, bool is_dir)
1995+{
1996+ int err;
1997+ enum ovl_path_type type;
1998+ struct path realpath;
1999+ struct dentry *upperdir;
2000+
2001+ err = ovl_copy_up(dentry->d_parent);
2002+ if (err)
2003+ return err;
2004+
2005+ upperdir = ovl_dentry_upper(dentry->d_parent);
2006+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2007+ type = ovl_path_real(dentry, &realpath);
2008+ if (type != OVL_PATH_LOWER) {
2009+ err = -ESTALE;
2010+ if (realpath.dentry->d_parent != upperdir)
2011+ goto out_d_drop;
2012+
2013+ if (is_dir)
2014+ err = vfs_rmdir(upperdir->d_inode, realpath.dentry);
2015+ else
2016+ err = vfs_unlink(upperdir->d_inode, realpath.dentry);
2017+ if (err)
2018+ goto out_d_drop;
2019+
2020+ ovl_dentry_version_inc(dentry->d_parent);
2021+ }
2022+
2023+ if (type != OVL_PATH_UPPER || ovl_dentry_is_opaque(dentry))
2024+ err = ovl_whiteout(upperdir, dentry);
2025+
2026+ /*
2027+ * Keeping this dentry hashed would mean having to release
2028+ * upperpath/lowerpath, which could only be done if we are the
2029+ * sole user of this dentry. Too tricky... Just unhash for
2030+ * now.
2031+ */
2032+out_d_drop:
2033+ d_drop(dentry);
2034+ mutex_unlock(&upperdir->d_inode->i_mutex);
2035+
2036+ return err;
2037+}
2038+
2039+static int ovl_unlink(struct inode *dir, struct dentry *dentry)
2040+{
2041+ return ovl_do_remove(dentry, false);
2042+}
2043+
2044+static int ovl_check_empty_dir(struct dentry *dentry)
2045+{
2046+ int err;
2047+ struct path lowerpath;
2048+ struct path upperpath;
2049+ struct ovl_cache_entry *p;
2050+ LIST_HEAD(list);
2051+ struct ovl_readdir_data rdd = { .list = &list };
2052+
2053+ ovl_path_upper(dentry, &upperpath);
2054+ ovl_path_lower(dentry, &lowerpath);
2055+
2056+ err = ovl_dir_read_merged(&upperpath, &lowerpath, &rdd);
2057+ if (err)
2058+ return err;
2059+
2060+ err = 0;
2061+
2062+ list_for_each_entry(p, &list, l_node) {
2063+ if (p->is_whiteout)
2064+ continue;
2065+
2066+ if (p->name[0] == '.') {
2067+ if (p->len == 1)
2068+ continue;
2069+ if (p->len == 2 && p->name[1] == '.')
2070+ continue;
2071+ }
2072+ err = -ENOTEMPTY;
2073+ break;
2074+ }
2075+
2076+ ovl_cache_free(&list);
2077+
2078+ return err;
2079+}
2080+
2081+static int ovl_unlink_whiteout(void *buf, const char *name, int namelen,
2082+ loff_t offset, u64 ino, unsigned int d_type)
2083+{
2084+ struct ovl_readdir_data *rdd = buf;
2085+
2086+ rdd->count++;
2087+ /* check d_type to filter out "." and ".." */
2088+ if (d_type == DT_LNK) {
2089+ struct dentry *dentry;
2090+
2091+ dentry = lookup_one_len(name, rdd->dir, namelen);
2092+ if (IS_ERR(dentry)) {
2093+ rdd->err = PTR_ERR(dentry);
2094+ } else {
2095+ rdd->err = vfs_unlink(rdd->dir->d_inode, dentry);
2096+ dput(dentry);
2097+ }
2098+ }
2099+
2100+ return rdd->err;
2101+}
2102+
2103+static int ovl_remove_whiteouts(struct dentry *dentry)
2104+{
2105+ struct path upperpath;
2106+ struct ovl_readdir_data rdd = { .list = NULL };
2107+
2108+ ovl_path_upper(dentry, &upperpath);
2109+ rdd.dir = upperpath.dentry;
2110+
2111+ return ovl_dir_read(&upperpath, &rdd, ovl_unlink_whiteout);
2112+}
2113+
2114+static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
2115+{
2116+ int err;
2117+ enum ovl_path_type type;
2118+
2119+ type = ovl_path_type(dentry);
2120+ if (type != OVL_PATH_UPPER) {
2121+ err = ovl_check_empty_dir(dentry);
2122+ if (err)
2123+ return err;
2124+
2125+ if (type == OVL_PATH_MERGE) {
2126+ err = ovl_remove_whiteouts(dentry);
2127+ if (err)
2128+ return err;
2129+ }
2130+ }
2131+
2132+ return ovl_do_remove(dentry, true);
2133+}
2134+
2135+static int ovl_link(struct dentry *old, struct inode *newdir,
2136+ struct dentry *new)
2137+{
2138+ int err;
2139+ struct dentry *olddentry;
2140+ struct dentry *newdentry;
2141+ struct dentry *upperdir;
2142+
2143+ err = ovl_copy_up(old);
2144+ if (err)
2145+ goto out;
2146+
2147+ err = ovl_copy_up(new->d_parent);
2148+ if (err)
2149+ goto out;
2150+
2151+ upperdir = ovl_dentry_upper(new->d_parent);
2152+ mutex_lock_nested(&upperdir->d_inode->i_mutex, I_MUTEX_PARENT);
2153+ newdentry = ovl_lookup_create(upperdir, new);
2154+ err = PTR_ERR(newdentry);
2155+ if (IS_ERR(newdentry))
2156+ goto out_unlock;
2157+
2158+ olddentry = ovl_dentry_upper(old);
2159+ err = vfs_link(olddentry, upperdir->d_inode, newdentry);
2160+ if (!err) {
2161+ ovl_dentry_version_inc(new->d_parent);
2162+ ovl_dentry_update(new, newdentry);
2163+
2164+ ihold(old->d_inode);
2165+ d_instantiate(new, old->d_inode);
2166+ } else {
2167+ dput(newdentry);
2168+ }
2169+out_unlock:
2170+ mutex_unlock(&upperdir->d_inode->i_mutex);
2171+out:
2172+ return err;
2173+
2174+}
2175+
2176+static int ovl_rename(struct inode *olddir, struct dentry *old,
2177+ struct inode *newdir, struct dentry *new)
2178+{
2179+ int err;
2180+ enum ovl_path_type old_type;
2181+ struct dentry *old_upperdir;
2182+ struct dentry *new_upperdir;
2183+ struct dentry *olddentry;
2184+ struct dentry *newdentry;
2185+ struct dentry *trap;
2186+ bool is_dir = S_ISDIR(old->d_inode->i_mode);
2187+
2188+ /* Don't copy up directory trees */
2189+ old_type = ovl_path_type(old);
2190+ if (old_type != OVL_PATH_UPPER && is_dir)
2191+ return -EXDEV;
2192+
2193+ if (new->d_inode) {
2194+ enum ovl_path_type new_type;
2195+
2196+ new_type = ovl_path_type(new);
2197+
2198+ if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
2199+ if (ovl_dentry_lower(old)->d_inode ==
2200+ ovl_dentry_lower(new)->d_inode)
2201+ return 0;
2202+ }
2203+ if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
2204+ if (ovl_dentry_upper(old)->d_inode ==
2205+ ovl_dentry_upper(new)->d_inode)
2206+ return 0;
2207+ }
2208+
2209+ if (new_type != OVL_PATH_UPPER &&
2210+ S_ISDIR(new->d_inode->i_mode)) {
2211+ err = ovl_check_empty_dir(new);
2212+ if (err)
2213+ return err;
2214+
2215+ if (new_type == OVL_PATH_MERGE) {
2216+ err = ovl_remove_whiteouts(new);
2217+ if (err)
2218+ return err;
2219+ }
2220+ }
2221+ }
2222+
2223+ err = ovl_copy_up(old);
2224+ if (err)
2225+ return err;
2226+
2227+ err = ovl_copy_up(new->d_parent);
2228+ if (err)
2229+ return err;
2230+
2231+ old_upperdir = ovl_dentry_upper(old->d_parent);
2232+ new_upperdir = ovl_dentry_upper(new->d_parent);
2233+
2234+ trap = lock_rename(new_upperdir, old_upperdir);
2235+
2236+ olddentry = ovl_dentry_upper(old);
2237+ newdentry = ovl_dentry_upper(new);
2238+ if (newdentry) {
2239+ dget(newdentry);
2240+ } else {
2241+ newdentry = ovl_lookup_create(new_upperdir, new);
2242+ err = PTR_ERR(newdentry);
2243+ if (IS_ERR(newdentry))
2244+ goto out_unlock;
2245+ }
2246+
2247+ err = -ESTALE;
2248+ if (olddentry->d_parent != old_upperdir)
2249+ goto out_dput;
2250+ if (newdentry->d_parent != new_upperdir)
2251+ goto out_dput;
2252+ if (olddentry == trap)
2253+ goto out_dput;
2254+ if (newdentry == trap)
2255+ goto out_dput;
2256+
2257+ err = vfs_rename(old_upperdir->d_inode, olddentry,
2258+ new_upperdir->d_inode, newdentry);
2259+
2260+ if (!err) {
2261+ bool old_opaque = ovl_dentry_is_opaque(old);
2262+ bool new_opaque = ovl_dentry_is_opaque(new);
2263+
2264+ if (ovl_path_type(new) != OVL_PATH_UPPER)
2265+ new_opaque = true;
2266+
2267+ if (old_type != OVL_PATH_UPPER || old_opaque)
2268+ err = ovl_whiteout(old_upperdir, old);
2269+ if (!err && is_dir) {
2270+ if (old_opaque && !new_opaque) {
2271+ ovl_remove_opaque(olddentry);
2272+ ovl_dentry_set_opaque(old, false);
2273+ }
2274+ if (!old_opaque && new_opaque) {
2275+ err = ovl_set_opaque(olddentry);
2276+ ovl_dentry_set_opaque(old, true);
2277+ }
2278+ }
2279+ ovl_dentry_version_inc(old->d_parent);
2280+ ovl_dentry_version_inc(new->d_parent);
2281+ }
2282+
2283+out_dput:
2284+ dput(newdentry);
2285+out_unlock:
2286+ unlock_rename(new_upperdir, old_upperdir);
2287+ return err;
2288+}
2289+
/*
 * Return true if @name lies in the overlay's private xattr namespace,
 * i.e. starts with "trusted.overlay.".
 *
 * The whole prefix, including the trailing dot, is compared; a shorter
 * comparison would also match unrelated names such as
 * "trusted.overlap".
 */
static bool ovl_is_private_xattr(const char *name)
{
	static const char prefix[] = "trusted.overlay.";

	return strncmp(name, prefix, sizeof(prefix) - 1) == 0;
}
2294+
2295+static int ovl_setxattr(struct dentry *dentry, const char *name,
2296+ const void *value, size_t size, int flags)
2297+{
2298+ int err;
2299+ struct dentry *upperdentry;
2300+
2301+ if (ovl_is_private_xattr(name))
2302+ return -EPERM;
2303+
2304+ err = ovl_copy_up(dentry);
2305+ if (err)
2306+ return err;
2307+
2308+ upperdentry = ovl_dentry_upper(dentry);
2309+ return vfs_setxattr(upperdentry, name, value, size, flags);
2310+}
2311+
2312+static ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
2313+ void *value, size_t size)
2314+{
2315+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
2316+ ovl_is_private_xattr(name))
2317+ return -ENODATA;
2318+
2319+ return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
2320+}
2321+
2322+static ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
2323+{
2324+ ssize_t res;
2325+ int off;
2326+
2327+ res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
2328+ if (res <= 0 || size == 0)
2329+ return res;
2330+
2331+ if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
2332+ return res;
2333+
2334+ /* filter out private xattrs */
2335+ for (off = 0; off < res;) {
2336+ char *s = list + off;
2337+ size_t slen = strlen(s) + 1;
2338+
2339+ BUG_ON(off + slen > res);
2340+
2341+ if (ovl_is_private_xattr(s)) {
2342+ res -= slen;
2343+ memmove(s, s + slen, res - off);
2344+ } else {
2345+ off += slen;
2346+ }
2347+ }
2348+
2349+ return res;
2350+}
2351+
2352+static int ovl_removexattr(struct dentry *dentry, const char *name)
2353+{
2354+ int err;
2355+ struct path realpath;
2356+ enum ovl_path_type type;
2357+
2358+ if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
2359+ ovl_is_private_xattr(name))
2360+ return -ENODATA;
2361+
2362+ type = ovl_path_real(dentry, &realpath);
2363+ if (type == OVL_PATH_LOWER) {
2364+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
2365+ if (err < 0)
2366+ return err;
2367+
2368+ err = ovl_copy_up(dentry);
2369+ if (err)
2370+ return err;
2371+
2372+ ovl_path_upper(dentry, &realpath);
2373+ }
2374+
2375+ return vfs_removexattr(realpath.dentry, name);
2376+}
2377+
2378+static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
2379+ struct dentry *realdentry)
2380+{
2381+ if (type != OVL_PATH_LOWER)
2382+ return false;
2383+
2384+ if (special_file(realdentry->d_inode->i_mode))
2385+ return false;
2386+
2387+ if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
2388+ return false;
2389+
2390+ return true;
2391+}
2392+
2393+static struct file *ovl_open(struct dentry *dentry, int flags,
2394+ const struct cred *cred)
2395+{
2396+ int err;
2397+ struct path realpath;
2398+ enum ovl_path_type type;
2399+
2400+ type = ovl_path_real(dentry, &realpath);
2401+ if (ovl_open_need_copy_up(flags, type, realpath.dentry)) {
2402+ if (flags & O_TRUNC)
2403+ err = ovl_copy_up_truncate(dentry, 0);
2404+ else
2405+ err = ovl_copy_up(dentry);
2406+ if (err)
2407+ return ERR_PTR(err);
2408+
2409+ ovl_path_upper(dentry, &realpath);
2410+ }
2411+
2412+ return vfs_open(&realpath, flags, cred);
2413+}
2414+
2415+static const struct inode_operations ovl_dir_inode_operations = {
2416+ .lookup = ovl_lookup,
2417+ .mkdir = ovl_mkdir,
2418+ .symlink = ovl_symlink,
2419+ .unlink = ovl_unlink,
2420+ .rmdir = ovl_rmdir,
2421+ .rename = ovl_rename,
2422+ .link = ovl_link,
2423+ .setattr = ovl_setattr,
2424+ .create = ovl_create,
2425+ .mknod = ovl_mknod,
2426+ .permission = ovl_permission,
2427+ .getattr = ovl_dir_getattr,
2428+ .setxattr = ovl_setxattr,
2429+ .getxattr = ovl_getxattr,
2430+ .listxattr = ovl_listxattr,
2431+ .removexattr = ovl_removexattr,
2432+};
2433+
2434+static const struct inode_operations ovl_file_inode_operations = {
2435+ .setattr = ovl_setattr,
2436+ .permission = ovl_permission,
2437+ .getattr = ovl_getattr,
2438+ .setxattr = ovl_setxattr,
2439+ .getxattr = ovl_getxattr,
2440+ .listxattr = ovl_listxattr,
2441+ .removexattr = ovl_removexattr,
2442+ .open = ovl_open,
2443+};
2444+
2445+static const struct inode_operations ovl_symlink_inode_operations = {
2446+ .setattr = ovl_setattr,
2447+ .follow_link = ovl_follow_link,
2448+ .put_link = ovl_put_link,
2449+ .readlink = ovl_readlink,
2450+ .getattr = ovl_getattr,
2451+ .setxattr = ovl_setxattr,
2452+ .getxattr = ovl_getxattr,
2453+ .listxattr = ovl_listxattr,
2454+ .removexattr = ovl_removexattr,
2455+};
2456+
2457+static struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
2458+ struct ovl_entry *oe)
2459+{
2460+ struct inode *inode;
2461+
2462+ inode = new_inode(sb);
2463+ if (!inode)
2464+ return NULL;
2465+
2466+ mode &= S_IFMT;
2467+
2468+ inode->i_ino = get_next_ino();
2469+ inode->i_mode = mode;
2470+ inode->i_flags |= S_NOATIME | S_NOCMTIME;
2471+
2472+ switch (mode) {
2473+ case S_IFDIR:
2474+ inode->i_private = oe;
2475+ inode->i_op = &ovl_dir_inode_operations;
2476+ inode->i_fop = &ovl_dir_operations;
2477+ break;
2478+
2479+ case S_IFLNK:
2480+ inode->i_op = &ovl_symlink_inode_operations;
2481+ break;
2482+
2483+ case S_IFREG:
2484+ case S_IFSOCK:
2485+ case S_IFBLK:
2486+ case S_IFCHR:
2487+ case S_IFIFO:
2488+ inode->i_op = &ovl_file_inode_operations;
2489+ break;
2490+
2491+ default:
2492+ WARN(1, "illegal file type: %i\n", mode);
2493+ inode = NULL;
2494+ }
2495+
2496+ return inode;
2497+
2498+}
2499+
2500+static void ovl_put_super(struct super_block *sb)
2501+{
2502+ struct ovl_fs *ufs = sb->s_fs_info;
2503+
2504+ if (!(sb->s_flags & MS_RDONLY))
2505+ mnt_drop_write(ufs->upper_mnt);
2506+
2507+ mntput(ufs->upper_mnt);
2508+ mntput(ufs->lower_mnt);
2509+
2510+ kfree(ufs);
2511+}
2512+
2513+static int ovl_remount_fs(struct super_block *sb, int *flagsp, char *data)
2514+{
2515+ int flags = *flagsp;
2516+ struct ovl_fs *ufs = sb->s_fs_info;
2517+
2518+ /* When remounting rw or ro, we need to adjust the write access to the
2519+ * upper fs.
2520+ */
2521+ if (((flags ^ sb->s_flags) & MS_RDONLY) == 0)
2522+ /* No change to readonly status */
2523+ return 0;
2524+
2525+ if (flags & MS_RDONLY) {
2526+ mnt_drop_write(ufs->upper_mnt);
2527+ return 0;
2528+ } else
2529+ return mnt_want_write(ufs->upper_mnt);
2530+}
2531+
2532+/**
2533+ * ovl_statfs
2534+ * @sb: The overlayfs super block
2535+ * @buf: The struct kstatfs to fill in with stats
2536+ *
2537+ * Get the filesystem statistics. As writes always target the upper layer
2538+ * filesystem pass the statfs to the same filesystem.
2539+ */
2540+static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
2541+{
2542+ struct dentry *root_dentry = dentry->d_sb->s_root;
2543+ struct path path;
2544+ ovl_path_upper(root_dentry, &path);
2545+
2546+ if (!path.dentry->d_sb->s_op->statfs)
2547+ return -ENOSYS;
2548+ return path.dentry->d_sb->s_op->statfs(path.dentry, buf);
2549+}
2550+
2551+static const struct super_operations ovl_super_operations = {
2552+ .put_super = ovl_put_super,
2553+ .remount_fs = ovl_remount_fs,
2554+ .statfs = ovl_statfs,
2555+};
2556+
2557+struct ovl_config {
2558+ char *lowerdir;
2559+ char *upperdir;
2560+};
2561+
2562+enum {
2563+ Opt_lowerdir,
2564+ Opt_upperdir,
2565+ Opt_err,
2566+};
2567+
2568+static const match_table_t ovl_tokens = {
2569+ {Opt_lowerdir, "lowerdir=%s"},
2570+ {Opt_upperdir, "upperdir=%s"},
2571+ {Opt_err, NULL}
2572+};
2573+
2574+static int ovl_parse_opt(char *opt, struct ovl_config *config)
2575+{
2576+ char *p;
2577+
2578+ config->upperdir = NULL;
2579+ config->lowerdir = NULL;
2580+
2581+ while ((p = strsep(&opt, ",")) != NULL) {
2582+ int token;
2583+ substring_t args[MAX_OPT_ARGS];
2584+
2585+ if (!*p)
2586+ continue;
2587+
2588+ token = match_token(p, ovl_tokens, args);
2589+ switch (token) {
2590+ case Opt_upperdir:
2591+ kfree(config->upperdir);
2592+ config->upperdir = match_strdup(&args[0]);
2593+ if (!config->upperdir)
2594+ return -ENOMEM;
2595+ break;
2596+
2597+ case Opt_lowerdir:
2598+ kfree(config->lowerdir);
2599+ config->lowerdir = match_strdup(&args[0]);
2600+ if (!config->lowerdir)
2601+ return -ENOMEM;
2602+ break;
2603+
2604+ default:
2605+ return -EINVAL;
2606+ }
2607+ }
2608+ return 0;
2609+}
2610+
2611+static int ovl_fill_super(struct super_block *sb, void *data, int silent)
2612+{
2613+ struct path lowerpath;
2614+ struct path upperpath;
2615+ struct inode *root_inode;
2616+ struct dentry *root_dentry;
2617+ struct ovl_entry *oe;
2618+ struct ovl_fs *ufs;
2619+ struct ovl_config config;
2620+ int err;
2621+
2622+ err = ovl_parse_opt((char *) data, &config);
2623+ if (err)
2624+ goto out;
2625+
2626+ err = -EINVAL;
2627+ if (!config.upperdir || !config.lowerdir) {
2628+ printk(KERN_ERR "overlayfs: missing upperdir or lowerdir\n");
2629+ goto out_free_config;
2630+ }
2631+
2632+ err = -ENOMEM;
2633+ ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL);
2634+ if (!ufs)
2635+ goto out_free_config;
2636+
2637+ oe = ovl_alloc_entry();
2638+ if (oe == NULL)
2639+ goto out_free_ufs;
2640+
2641+ root_inode = ovl_new_inode(sb, S_IFDIR, oe);
2642+ if (!root_inode)
2643+ goto out_free_oe;
2644+
2645+ err = kern_path(config.upperdir, LOOKUP_FOLLOW, &upperpath);
2646+ if (err)
2647+ goto out_put_root;
2648+
2649+ err = kern_path(config.lowerdir, LOOKUP_FOLLOW, &lowerpath);
2650+ if (err)
2651+ goto out_put_upperpath;
2652+
2653+ err = -ENOTDIR;
2654+ if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
2655+ !S_ISDIR(lowerpath.dentry->d_inode->i_mode))
2656+ goto out_put_lowerpath;
2657+
2658+ ufs->upper_mnt = clone_private_mount(&upperpath);
2659+ err = PTR_ERR(ufs->upper_mnt);
2660+ if (IS_ERR(ufs->upper_mnt)) {
2661+ printk(KERN_ERR "overlayfs: failed to clone upperpath\n");
2662+ goto out_put_lowerpath;
2663+ }
2664+
2665+ ufs->lower_mnt = clone_private_mount(&lowerpath);
2666+ err = PTR_ERR(ufs->lower_mnt);
2667+ if (IS_ERR(ufs->lower_mnt)) {
2668+ printk(KERN_ERR "overlayfs: failed to clone lowerpath\n");
2669+ goto out_put_upper_mnt;
2670+ }
2671+
2672+ if (!(sb->s_flags & MS_RDONLY)) {
2673+ err = mnt_want_write(ufs->upper_mnt);
2674+ if (err)
2675+ goto out_put_lower_mnt;
2676+ }
2677+
2678+ err = -ENOMEM;
2679+ root_dentry = d_alloc_root(root_inode);
2680+ if (!root_dentry)
2681+ goto out_drop_write;
2682+
2683+ mntput(upperpath.mnt);
2684+ mntput(lowerpath.mnt);
2685+
2686+ oe->__upperdentry = upperpath.dentry;
2687+ oe->lowerdentry = lowerpath.dentry;
2688+
2689+ root_dentry->d_fsdata = oe;
2690+ root_dentry->d_op = &ovl_dentry_operations;
2691+
2692+ sb->s_op = &ovl_super_operations;
2693+ sb->s_root = root_dentry;
2694+ sb->s_fs_info = ufs;
2695+
2696+ return 0;
2697+
2698+out_drop_write:
2699+ if (!(sb->s_flags & MS_RDONLY))
2700+ mnt_drop_write(ufs->upper_mnt);
2701+out_put_lower_mnt:
2702+ mntput(ufs->lower_mnt);
2703+out_put_upper_mnt:
2704+ mntput(ufs->upper_mnt);
2705+out_put_lowerpath:
2706+ path_put(&lowerpath);
2707+out_put_upperpath:
2708+ path_put(&upperpath);
2709+out_put_root:
2710+ iput(root_inode);
2711+out_free_oe:
2712+ kfree(oe);
2713+out_free_ufs:
2714+ kfree(ufs);
2715+out_free_config:
2716+ kfree(config.lowerdir);
2717+ kfree(config.upperdir);
2718+out:
2719+ return err;
2720+}
2721+
2722+static int ovl_get_sb(struct file_system_type *fs_type,
2723+ int flags, const char *dev_name,
2724+ void *raw_data, struct vfsmount *mnt)
2725+{
2726+ return get_sb_nodev(fs_type, flags, raw_data, ovl_fill_super, mnt);
2727+}
2728+
2729+static struct file_system_type ovl_fs_type = {
2730+ .owner = THIS_MODULE,
2731+ .name = "overlayfs",
2732+ .get_sb = ovl_get_sb,
2733+ .kill_sb = kill_anon_super,
2734+};
2735+
2736+static int __init ovl_init(void)
2737+{
2738+ return register_filesystem(&ovl_fs_type);
2739+}
2740+
2741+static void __exit ovl_exit(void)
2742+{
2743+ unregister_filesystem(&ovl_fs_type);
2744+}
2745+
2746+module_init(ovl_init);
2747+module_exit(ovl_exit);
2748--- a/fs/splice.c
2749+++ b/fs/splice.c
2750@@ -1296,6 +1296,7 @@ long do_splice_direct(struct file *in, l
2751 
2752     return ret;
2753 }
2754+EXPORT_SYMBOL(do_splice_direct);
2755 
2756 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
2757                    struct pipe_inode_info *opipe,
2758--- a/include/linux/fs.h
2759+++ b/include/linux/fs.h
2760@@ -1587,6 +1587,7 @@ struct inode_operations {
2761     void (*truncate_range)(struct inode *, loff_t, loff_t);
2762     int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
2763               u64 len);
2764+ struct file *(*open)(struct dentry *, int flags, const struct cred *);
2765 } ____cacheline_aligned;
2766 
2767 struct seq_file;
2768@@ -1990,6 +1991,7 @@ extern int do_fallocate(struct file *fil
2769 extern long do_sys_open(int dfd, const char __user *filename, int flags,
2770             int mode);
2771 extern struct file *filp_open(const char *, int, int);
2772+extern struct file *vfs_open(struct path *, int flags, const struct cred *);
2773 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
2774                  const struct cred *);
2775 extern int filp_close(struct file *, fl_owner_t id);
2776--- a/include/linux/mount.h
2777+++ b/include/linux/mount.h
2778@@ -100,6 +100,9 @@ extern void mnt_pin(struct vfsmount *mnt
2779 extern void mnt_unpin(struct vfsmount *mnt);
2780 extern int __mnt_is_readonly(struct vfsmount *mnt);
2781 
2782+struct path;
2783+extern struct vfsmount *clone_private_mount(struct path *path);
2784+
2785 extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
2786                       const char *name, void *data);
2787 
2788

Archive Download this file



interactive