Root/fs/ntfs/file.c

1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 *
4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
5 *
6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published
8 * by the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program/include file is distributed in the hope that it will be
12 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
13 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program (in the main directory of the Linux-NTFS
18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22#include <linux/buffer_head.h>
23#include <linux/gfp.h>
24#include <linux/pagemap.h>
25#include <linux/pagevec.h>
26#include <linux/sched.h>
27#include <linux/swap.h>
28#include <linux/uio.h>
29#include <linux/writeback.h>
30
31#include <asm/page.h>
32#include <asm/uaccess.h>
33
34#include "attrib.h"
35#include "bitmap.h"
36#include "inode.h"
37#include "debug.h"
38#include "lcnalloc.h"
39#include "malloc.h"
40#include "mft.h"
41#include "ntfs.h"
42
43/**
44 * ntfs_file_open - called when an inode is about to be opened
45 * @vi: inode to be opened
46 * @filp: file structure describing the inode
47 *
48 * Limit file size to the page cache limit on architectures where unsigned long
49 * is 32-bits. This is the most we can do for now without overflowing the page
50 * cache page index. Doing it this way means we don't run into problems because
51 * of existing too large files. It would be better to allow the user to read
52 * the beginning of the file but I doubt very much anyone is going to hit this
53 * check on a 32-bit architecture, so there is no point in adding the extra
54 * complexity required to support this.
55 *
56 * On 64-bit architectures, the check is hopefully optimized away by the
57 * compiler.
58 *
59 * After the check passes, just call generic_file_open() to do its work.
60 */
61static int ntfs_file_open(struct inode *vi, struct file *filp)
62{
63    if (sizeof(unsigned long) < 8) {
64        if (i_size_read(vi) > MAX_LFS_FILESIZE)
65            return -EOVERFLOW;
66    }
67    return generic_file_open(vi, filp);
68}
69
70#ifdef NTFS_RW
71
72/**
73 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
74 * @ni: ntfs inode of the attribute to extend
75 * @new_init_size: requested new initialized size in bytes
76 * @cached_page: store any allocated but unused page here
77 * @lru_pvec: lru-buffering pagevec of the caller
78 *
79 * Extend the initialized size of an attribute described by the ntfs inode @ni
80 * to @new_init_size bytes. This involves zeroing any non-sparse space between
81 * the old initialized size and @new_init_size both in the page cache and on
82 * disk (if relevant complete pages are already uptodate in the page cache then
83 * these are simply marked dirty).
84 *
85 * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
86 * in the resident attribute case, it is tied to the initialized size and, in
87 * the non-resident attribute case, it may not fall below the initialized size.
88 *
89 * Note that if the attribute is resident, we do not need to touch the page
90 * cache at all. This is because if the page cache page is not uptodate we
91 * bring it uptodate later, when doing the write to the mft record since we
92 * then already have the page mapped. And if the page is uptodate, the
93 * non-initialized region will already have been zeroed when the page was
94 * brought uptodate and the region may in fact already have been overwritten
95 * with new data via mmap() based writes, so we cannot just zero it. And since
96 * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
97 * is unspecified, we choose not to do zeroing and thus we do not need to touch
98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c.
100 *
101 * Return 0 on success and -errno on error. In the case that an error is
102 * encountered it is possible that the initialized size will already have been
103 * incremented some way towards @new_init_size but it is guaranteed that if
104 * this is the case, the necessary zeroing will also have happened and that all
105 * metadata is self-consistent.
106 *
107 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
108 * held by the caller.
109 */
110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
111{
112    s64 old_init_size;
113    loff_t old_i_size;
114    pgoff_t index, end_index;
115    unsigned long flags;
116    struct inode *vi = VFS_I(ni);
117    ntfs_inode *base_ni;
118    MFT_RECORD *m = NULL;
119    ATTR_RECORD *a;
120    ntfs_attr_search_ctx *ctx = NULL;
121    struct address_space *mapping;
122    struct page *page = NULL;
123    u8 *kattr;
124    int err;
125    u32 attr_len;
126
127    read_lock_irqsave(&ni->size_lock, flags);
128    old_init_size = ni->initialized_size;
129    old_i_size = i_size_read(vi);
130    BUG_ON(new_init_size > ni->allocated_size);
131    read_unlock_irqrestore(&ni->size_lock, flags);
132    ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
133            "old_initialized_size 0x%llx, "
134            "new_initialized_size 0x%llx, i_size 0x%llx.",
135            vi->i_ino, (unsigned)le32_to_cpu(ni->type),
136            (unsigned long long)old_init_size,
137            (unsigned long long)new_init_size, old_i_size);
138    if (!NInoAttr(ni))
139        base_ni = ni;
140    else
141        base_ni = ni->ext.base_ntfs_ino;
142    /* Use goto to reduce indentation and we need the label below anyway. */
143    if (NInoNonResident(ni))
144        goto do_non_resident_extend;
145    BUG_ON(old_init_size != old_i_size);
146    m = map_mft_record(base_ni);
147    if (IS_ERR(m)) {
148        err = PTR_ERR(m);
149        m = NULL;
150        goto err_out;
151    }
152    ctx = ntfs_attr_get_search_ctx(base_ni, m);
153    if (unlikely(!ctx)) {
154        err = -ENOMEM;
155        goto err_out;
156    }
157    err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
158            CASE_SENSITIVE, 0, NULL, 0, ctx);
159    if (unlikely(err)) {
160        if (err == -ENOENT)
161            err = -EIO;
162        goto err_out;
163    }
164    m = ctx->mrec;
165    a = ctx->attr;
166    BUG_ON(a->non_resident);
167    /* The total length of the attribute value. */
168    attr_len = le32_to_cpu(a->data.resident.value_length);
169    BUG_ON(old_i_size != (loff_t)attr_len);
170    /*
171     * Do the zeroing in the mft record and update the attribute size in
172     * the mft record.
173     */
174    kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
175    memset(kattr + attr_len, 0, new_init_size - attr_len);
176    a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
177    /* Finally, update the sizes in the vfs and ntfs inodes. */
178    write_lock_irqsave(&ni->size_lock, flags);
179    i_size_write(vi, new_init_size);
180    ni->initialized_size = new_init_size;
181    write_unlock_irqrestore(&ni->size_lock, flags);
182    goto done;
183do_non_resident_extend:
184    /*
185     * If the new initialized size @new_init_size exceeds the current file
186     * size (vfs inode->i_size), we need to extend the file size to the
187     * new initialized size.
188     */
189    if (new_init_size > old_i_size) {
190        m = map_mft_record(base_ni);
191        if (IS_ERR(m)) {
192            err = PTR_ERR(m);
193            m = NULL;
194            goto err_out;
195        }
196        ctx = ntfs_attr_get_search_ctx(base_ni, m);
197        if (unlikely(!ctx)) {
198            err = -ENOMEM;
199            goto err_out;
200        }
201        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
202                CASE_SENSITIVE, 0, NULL, 0, ctx);
203        if (unlikely(err)) {
204            if (err == -ENOENT)
205                err = -EIO;
206            goto err_out;
207        }
208        m = ctx->mrec;
209        a = ctx->attr;
210        BUG_ON(!a->non_resident);
211        BUG_ON(old_i_size != (loff_t)
212                sle64_to_cpu(a->data.non_resident.data_size));
213        a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
214        flush_dcache_mft_record_page(ctx->ntfs_ino);
215        mark_mft_record_dirty(ctx->ntfs_ino);
216        /* Update the file size in the vfs inode. */
217        i_size_write(vi, new_init_size);
218        ntfs_attr_put_search_ctx(ctx);
219        ctx = NULL;
220        unmap_mft_record(base_ni);
221        m = NULL;
222    }
223    mapping = vi->i_mapping;
224    index = old_init_size >> PAGE_CACHE_SHIFT;
225    end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
226    do {
227        /*
228         * Read the page. If the page is not present, this will zero
229         * the uninitialized regions for us.
230         */
231        page = read_mapping_page(mapping, index, NULL);
232        if (IS_ERR(page)) {
233            err = PTR_ERR(page);
234            goto init_err_out;
235        }
236        if (unlikely(PageError(page))) {
237            page_cache_release(page);
238            err = -EIO;
239            goto init_err_out;
240        }
241        /*
242         * Update the initialized size in the ntfs inode. This is
243         * enough to make ntfs_writepage() work.
244         */
245        write_lock_irqsave(&ni->size_lock, flags);
246        ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
247        if (ni->initialized_size > new_init_size)
248            ni->initialized_size = new_init_size;
249        write_unlock_irqrestore(&ni->size_lock, flags);
250        /* Set the page dirty so it gets written out. */
251        set_page_dirty(page);
252        page_cache_release(page);
253        /*
254         * Play nice with the vm and the rest of the system. This is
255         * very much needed as we can potentially be modifying the
256         * initialised size from a very small value to a really huge
257         * value, e.g.
258         * f = open(somefile, O_TRUNC);
259         * truncate(f, 10GiB);
260         * seek(f, 10GiB);
261         * write(f, 1);
262         * And this would mean we would be marking dirty hundreds of
263         * thousands of pages or as in the above example more than
264         * two and a half million pages!
265         *
266         * TODO: For sparse pages could optimize this workload by using
267         * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
268         * would be set in readpage for sparse pages and here we would
269         * not need to mark dirty any pages which have this bit set.
270         * The only caveat is that we have to clear the bit everywhere
271         * where we allocate any clusters that lie in the page or that
272         * contain the page.
273         *
274         * TODO: An even greater optimization would be for us to only
275         * call readpage() on pages which are not in sparse regions as
276         * determined from the runlist. This would greatly reduce the
277         * number of pages we read and make dirty in the case of sparse
278         * files.
279         */
280        balance_dirty_pages_ratelimited(mapping);
281        cond_resched();
282    } while (++index < end_index);
283    read_lock_irqsave(&ni->size_lock, flags);
284    BUG_ON(ni->initialized_size != new_init_size);
285    read_unlock_irqrestore(&ni->size_lock, flags);
286    /* Now bring in sync the initialized_size in the mft record. */
287    m = map_mft_record(base_ni);
288    if (IS_ERR(m)) {
289        err = PTR_ERR(m);
290        m = NULL;
291        goto init_err_out;
292    }
293    ctx = ntfs_attr_get_search_ctx(base_ni, m);
294    if (unlikely(!ctx)) {
295        err = -ENOMEM;
296        goto init_err_out;
297    }
298    err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
299            CASE_SENSITIVE, 0, NULL, 0, ctx);
300    if (unlikely(err)) {
301        if (err == -ENOENT)
302            err = -EIO;
303        goto init_err_out;
304    }
305    m = ctx->mrec;
306    a = ctx->attr;
307    BUG_ON(!a->non_resident);
308    a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
309done:
310    flush_dcache_mft_record_page(ctx->ntfs_ino);
311    mark_mft_record_dirty(ctx->ntfs_ino);
312    if (ctx)
313        ntfs_attr_put_search_ctx(ctx);
314    if (m)
315        unmap_mft_record(base_ni);
316    ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
317            (unsigned long long)new_init_size, i_size_read(vi));
318    return 0;
319init_err_out:
320    write_lock_irqsave(&ni->size_lock, flags);
321    ni->initialized_size = old_init_size;
322    write_unlock_irqrestore(&ni->size_lock, flags);
323err_out:
324    if (ctx)
325        ntfs_attr_put_search_ctx(ctx);
326    if (m)
327        unmap_mft_record(base_ni);
328    ntfs_debug("Failed. Returning error code %i.", err);
329    return err;
330}
331
332/**
333 * ntfs_fault_in_pages_readable -
334 *
335 * Fault a number of userspace pages into pagetables.
336 *
337 * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
338 * with more than two userspace pages as well as handling the single page case
339 * elegantly.
340 *
341 * If you find this difficult to understand, then think of the while loop being
342 * the following code, except that we do without the integer variable ret:
343 *
344 * do {
345 * ret = __get_user(c, uaddr);
346 * uaddr += PAGE_SIZE;
347 * } while (!ret && uaddr < end);
348 *
349 * Note, the final __get_user() may well run out-of-bounds of the user buffer,
350 * but _not_ out-of-bounds of the page the user buffer belongs to, and since
351 * this is only a read and not a write, and since it is still in the same page,
352 * it should not matter and this makes the code much simpler.
353 */
354static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
355        int bytes)
356{
357    const char __user *end;
358    volatile char c;
359
360    /* Set @end to the first byte outside the last page we care about. */
361    end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
362
363    while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
364        ;
365}
366
367/**
368 * ntfs_fault_in_pages_readable_iovec -
369 *
370 * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
371 */
372static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
373        size_t iov_ofs, int bytes)
374{
375    do {
376        const char __user *buf;
377        unsigned len;
378
379        buf = iov->iov_base + iov_ofs;
380        len = iov->iov_len - iov_ofs;
381        if (len > bytes)
382            len = bytes;
383        ntfs_fault_in_pages_readable(buf, len);
384        bytes -= len;
385        iov++;
386        iov_ofs = 0;
387    } while (bytes);
388}
389
390/**
391 * __ntfs_grab_cache_pages - obtain a number of locked pages
392 * @mapping: address space mapping from which to obtain page cache pages
393 * @index: starting index in @mapping at which to begin obtaining pages
394 * @nr_pages: number of page cache pages to obtain
395 * @pages: array of pages in which to return the obtained page cache pages
396 * @cached_page: allocated but as yet unused page
397 * @lru_pvec: lru-buffering pagevec of caller
398 *
399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
400 * starting at index @index.
401 *
402 * If a page is newly created, add it to lru list
403 *
404 * Note, the page locks are obtained in ascending page index order.
405 */
406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
407        pgoff_t index, const unsigned nr_pages, struct page **pages,
408        struct page **cached_page)
409{
410    int err, nr;
411
412    BUG_ON(!nr_pages);
413    err = nr = 0;
414    do {
415        pages[nr] = find_lock_page(mapping, index);
416        if (!pages[nr]) {
417            if (!*cached_page) {
418                *cached_page = page_cache_alloc(mapping);
419                if (unlikely(!*cached_page)) {
420                    err = -ENOMEM;
421                    goto err_out;
422                }
423            }
424            err = add_to_page_cache_lru(*cached_page, mapping, index,
425                    GFP_KERNEL);
426            if (unlikely(err)) {
427                if (err == -EEXIST)
428                    continue;
429                goto err_out;
430            }
431            pages[nr] = *cached_page;
432            *cached_page = NULL;
433        }
434        index++;
435        nr++;
436    } while (nr < nr_pages);
437out:
438    return err;
439err_out:
440    while (nr > 0) {
441        unlock_page(pages[--nr]);
442        page_cache_release(pages[nr]);
443    }
444    goto out;
445}
446
447static inline int ntfs_submit_bh_for_read(struct buffer_head *bh)
448{
449    lock_buffer(bh);
450    get_bh(bh);
451    bh->b_end_io = end_buffer_read_sync;
452    return submit_bh(READ, bh);
453}
454
455/**
456 * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
457 * @pages: array of destination pages
458 * @nr_pages: number of pages in @pages
459 * @pos: byte position in file at which the write begins
460 * @bytes: number of bytes to be written
461 *
462 * This is called for non-resident attributes from ntfs_file_buffered_write()
463 * with i_mutex held on the inode (@pages[0]->mapping->host). There are
464 * @nr_pages pages in @pages which are locked but not kmap()ped. The source
465 * data has not yet been copied into the @pages.
466 *
467 * Need to fill any holes with actual clusters, allocate buffers if necessary,
468 * ensure all the buffers are mapped, and bring uptodate any buffers that are
469 * only partially being written to.
470 *
471 * If @nr_pages is greater than one, we are guaranteed that the cluster size is
472 * greater than PAGE_CACHE_SIZE, that all pages in @pages are entirely inside
473 * the same cluster and that they are the entirety of that cluster, and that
474 * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
475 *
476 * i_size is not to be modified yet.
477 *
478 * Return 0 on success or -errno on error.
479 */
480static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
481        unsigned nr_pages, s64 pos, size_t bytes)
482{
483    VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
484    LCN lcn;
485    s64 bh_pos, vcn_len, end, initialized_size;
486    sector_t lcn_block;
487    struct page *page;
488    struct inode *vi;
489    ntfs_inode *ni, *base_ni = NULL;
490    ntfs_volume *vol;
491    runlist_element *rl, *rl2;
492    struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
493    ntfs_attr_search_ctx *ctx = NULL;
494    MFT_RECORD *m = NULL;
495    ATTR_RECORD *a = NULL;
496    unsigned long flags;
497    u32 attr_rec_len = 0;
498    unsigned blocksize, u;
499    int err, mp_size;
500    bool rl_write_locked, was_hole, is_retry;
501    unsigned char blocksize_bits;
502    struct {
503        u8 runlist_merged:1;
504        u8 mft_attr_mapped:1;
505        u8 mp_rebuilt:1;
506        u8 attr_switched:1;
507    } status = { 0, 0, 0, 0 };
508
509    BUG_ON(!nr_pages);
510    BUG_ON(!pages);
511    BUG_ON(!*pages);
512    vi = pages[0]->mapping->host;
513    ni = NTFS_I(vi);
514    vol = ni->vol;
515    ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
516            "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
517            vi->i_ino, ni->type, pages[0]->index, nr_pages,
518            (long long)pos, bytes);
519    blocksize = vol->sb->s_blocksize;
520    blocksize_bits = vol->sb->s_blocksize_bits;
521    u = 0;
522    do {
523        page = pages[u];
524        BUG_ON(!page);
525        /*
526         * create_empty_buffers() will create uptodate/dirty buffers if
527         * the page is uptodate/dirty.
528         */
529        if (!page_has_buffers(page)) {
530            create_empty_buffers(page, blocksize, 0);
531            if (unlikely(!page_has_buffers(page)))
532                return -ENOMEM;
533        }
534    } while (++u < nr_pages);
535    rl_write_locked = false;
536    rl = NULL;
537    err = 0;
538    vcn = lcn = -1;
539    vcn_len = 0;
540    lcn_block = -1;
541    was_hole = false;
542    cpos = pos >> vol->cluster_size_bits;
543    end = pos + bytes;
544    cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
545    /*
546     * Loop over each page and for each page over each buffer. Use goto to
547     * reduce indentation.
548     */
549    u = 0;
550do_next_page:
551    page = pages[u];
552    bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
553    bh = head = page_buffers(page);
554    do {
555        VCN cdelta;
556        s64 bh_end;
557        unsigned bh_cofs;
558
559        /* Clear buffer_new on all buffers to reinitialise state. */
560        if (buffer_new(bh))
561            clear_buffer_new(bh);
562        bh_end = bh_pos + blocksize;
563        bh_cpos = bh_pos >> vol->cluster_size_bits;
564        bh_cofs = bh_pos & vol->cluster_size_mask;
565        if (buffer_mapped(bh)) {
566            /*
567             * The buffer is already mapped. If it is uptodate,
568             * ignore it.
569             */
570            if (buffer_uptodate(bh))
571                continue;
572            /*
573             * The buffer is not uptodate. If the page is uptodate
574             * set the buffer uptodate and otherwise ignore it.
575             */
576            if (PageUptodate(page)) {
577                set_buffer_uptodate(bh);
578                continue;
579            }
580            /*
581             * Neither the page nor the buffer are uptodate. If
582             * the buffer is only partially being written to, we
583             * need to read it in before the write, i.e. now.
584             */
585            if ((bh_pos < pos && bh_end > pos) ||
586                    (bh_pos < end && bh_end > end)) {
587                /*
588                 * If the buffer is fully or partially within
589                 * the initialized size, do an actual read.
590                 * Otherwise, simply zero the buffer.
591                 */
592                read_lock_irqsave(&ni->size_lock, flags);
593                initialized_size = ni->initialized_size;
594                read_unlock_irqrestore(&ni->size_lock, flags);
595                if (bh_pos < initialized_size) {
596                    ntfs_submit_bh_for_read(bh);
597                    *wait_bh++ = bh;
598                } else {
599                    zero_user(page, bh_offset(bh),
600                            blocksize);
601                    set_buffer_uptodate(bh);
602                }
603            }
604            continue;
605        }
606        /* Unmapped buffer. Need to map it. */
607        bh->b_bdev = vol->sb->s_bdev;
608        /*
609         * If the current buffer is in the same clusters as the map
610         * cache, there is no need to check the runlist again. The
611         * map cache is made up of @vcn, which is the first cached file
612         * cluster, @vcn_len which is the number of cached file
613         * clusters, @lcn is the device cluster corresponding to @vcn,
614         * and @lcn_block is the block number corresponding to @lcn.
615         */
616        cdelta = bh_cpos - vcn;
617        if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
618map_buffer_cached:
619            BUG_ON(lcn < 0);
620            bh->b_blocknr = lcn_block +
621                    (cdelta << (vol->cluster_size_bits -
622                    blocksize_bits)) +
623                    (bh_cofs >> blocksize_bits);
624            set_buffer_mapped(bh);
625            /*
626             * If the page is uptodate so is the buffer. If the
627             * buffer is fully outside the write, we ignore it if
628             * it was already allocated and we mark it dirty so it
629             * gets written out if we allocated it. On the other
630             * hand, if we allocated the buffer but we are not
631             * marking it dirty we set buffer_new so we can do
632             * error recovery.
633             */
634            if (PageUptodate(page)) {
635                if (!buffer_uptodate(bh))
636                    set_buffer_uptodate(bh);
637                if (unlikely(was_hole)) {
638                    /* We allocated the buffer. */
639                    unmap_underlying_metadata(bh->b_bdev,
640                            bh->b_blocknr);
641                    if (bh_end <= pos || bh_pos >= end)
642                        mark_buffer_dirty(bh);
643                    else
644                        set_buffer_new(bh);
645                }
646                continue;
647            }
648            /* Page is _not_ uptodate. */
649            if (likely(!was_hole)) {
650                /*
651                 * Buffer was already allocated. If it is not
652                 * uptodate and is only partially being written
653                 * to, we need to read it in before the write,
654                 * i.e. now.
655                 */
656                if (!buffer_uptodate(bh) && bh_pos < end &&
657                        bh_end > pos &&
658                        (bh_pos < pos ||
659                        bh_end > end)) {
660                    /*
661                     * If the buffer is fully or partially
662                     * within the initialized size, do an
663                     * actual read. Otherwise, simply zero
664                     * the buffer.
665                     */
666                    read_lock_irqsave(&ni->size_lock,
667                            flags);
668                    initialized_size = ni->initialized_size;
669                    read_unlock_irqrestore(&ni->size_lock,
670                            flags);
671                    if (bh_pos < initialized_size) {
672                        ntfs_submit_bh_for_read(bh);
673                        *wait_bh++ = bh;
674                    } else {
675                        zero_user(page, bh_offset(bh),
676                                blocksize);
677                        set_buffer_uptodate(bh);
678                    }
679                }
680                continue;
681            }
682            /* We allocated the buffer. */
683            unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
684            /*
685             * If the buffer is fully outside the write, zero it,
686             * set it uptodate, and mark it dirty so it gets
687             * written out. If it is partially being written to,
688             * zero region surrounding the write but leave it to
689             * commit write to do anything else. Finally, if the
690             * buffer is fully being overwritten, do nothing.
691             */
692            if (bh_end <= pos || bh_pos >= end) {
693                if (!buffer_uptodate(bh)) {
694                    zero_user(page, bh_offset(bh),
695                            blocksize);
696                    set_buffer_uptodate(bh);
697                }
698                mark_buffer_dirty(bh);
699                continue;
700            }
701            set_buffer_new(bh);
702            if (!buffer_uptodate(bh) &&
703                    (bh_pos < pos || bh_end > end)) {
704                u8 *kaddr;
705                unsigned pofs;
706                    
707                kaddr = kmap_atomic(page, KM_USER0);
708                if (bh_pos < pos) {
709                    pofs = bh_pos & ~PAGE_CACHE_MASK;
710                    memset(kaddr + pofs, 0, pos - bh_pos);
711                }
712                if (bh_end > end) {
713                    pofs = end & ~PAGE_CACHE_MASK;
714                    memset(kaddr + pofs, 0, bh_end - end);
715                }
716                kunmap_atomic(kaddr, KM_USER0);
717                flush_dcache_page(page);
718            }
719            continue;
720        }
721        /*
722         * Slow path: this is the first buffer in the cluster. If it
723         * is outside allocated size and is not uptodate, zero it and
724         * set it uptodate.
725         */
726        read_lock_irqsave(&ni->size_lock, flags);
727        initialized_size = ni->allocated_size;
728        read_unlock_irqrestore(&ni->size_lock, flags);
729        if (bh_pos > initialized_size) {
730            if (PageUptodate(page)) {
731                if (!buffer_uptodate(bh))
732                    set_buffer_uptodate(bh);
733            } else if (!buffer_uptodate(bh)) {
734                zero_user(page, bh_offset(bh), blocksize);
735                set_buffer_uptodate(bh);
736            }
737            continue;
738        }
739        is_retry = false;
740        if (!rl) {
741            down_read(&ni->runlist.lock);
742retry_remap:
743            rl = ni->runlist.rl;
744        }
745        if (likely(rl != NULL)) {
746            /* Seek to element containing target cluster. */
747            while (rl->length && rl[1].vcn <= bh_cpos)
748                rl++;
749            lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
750            if (likely(lcn >= 0)) {
751                /*
752                 * Successful remap, setup the map cache and
753                 * use that to deal with the buffer.
754                 */
755                was_hole = false;
756                vcn = bh_cpos;
757                vcn_len = rl[1].vcn - vcn;
758                lcn_block = lcn << (vol->cluster_size_bits -
759                        blocksize_bits);
760                cdelta = 0;
761                /*
762                 * If the number of remaining clusters touched
763                 * by the write is smaller or equal to the
764                 * number of cached clusters, unlock the
765                 * runlist as the map cache will be used from
766                 * now on.
767                 */
768                if (likely(vcn + vcn_len >= cend)) {
769                    if (rl_write_locked) {
770                        up_write(&ni->runlist.lock);
771                        rl_write_locked = false;
772                    } else
773                        up_read(&ni->runlist.lock);
774                    rl = NULL;
775                }
776                goto map_buffer_cached;
777            }
778        } else
779            lcn = LCN_RL_NOT_MAPPED;
780        /*
781         * If it is not a hole and not out of bounds, the runlist is
782         * probably unmapped so try to map it now.
783         */
784        if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
785            if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
786                /* Attempt to map runlist. */
787                if (!rl_write_locked) {
788                    /*
789                     * We need the runlist locked for
790                     * writing, so if it is locked for
791                     * reading relock it now and retry in
792                     * case it changed whilst we dropped
793                     * the lock.
794                     */
795                    up_read(&ni->runlist.lock);
796                    down_write(&ni->runlist.lock);
797                    rl_write_locked = true;
798                    goto retry_remap;
799                }
800                err = ntfs_map_runlist_nolock(ni, bh_cpos,
801                        NULL);
802                if (likely(!err)) {
803                    is_retry = true;
804                    goto retry_remap;
805                }
806                /*
807                 * If @vcn is out of bounds, pretend @lcn is
808                 * LCN_ENOENT. As long as the buffer is out
809                 * of bounds this will work fine.
810                 */
811                if (err == -ENOENT) {
812                    lcn = LCN_ENOENT;
813                    err = 0;
814                    goto rl_not_mapped_enoent;
815                }
816            } else
817                err = -EIO;
818            /* Failed to map the buffer, even after retrying. */
819            bh->b_blocknr = -1;
820            ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
821                    "attribute type 0x%x, vcn 0x%llx, "
822                    "vcn offset 0x%x, because its "
823                    "location on disk could not be "
824                    "determined%s (error code %i).",
825                    ni->mft_no, ni->type,
826                    (unsigned long long)bh_cpos,
827                    (unsigned)bh_pos &
828                    vol->cluster_size_mask,
829                    is_retry ? " even after retrying" : "",
830                    err);
831            break;
832        }
833rl_not_mapped_enoent:
834        /*
835         * The buffer is in a hole or out of bounds. We need to fill
836         * the hole, unless the buffer is in a cluster which is not
837         * touched by the write, in which case we just leave the buffer
838         * unmapped. This can only happen when the cluster size is
839         * less than the page cache size.
840         */
841        if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
842            bh_cend = (bh_end + vol->cluster_size - 1) >>
843                    vol->cluster_size_bits;
844            if ((bh_cend <= cpos || bh_cpos >= cend)) {
845                bh->b_blocknr = -1;
846                /*
847                 * If the buffer is uptodate we skip it. If it
848                 * is not but the page is uptodate, we can set
849                 * the buffer uptodate. If the page is not
850                 * uptodate, we can clear the buffer and set it
851                 * uptodate. Whether this is worthwhile is
852                 * debatable and this could be removed.
853                 */
854                if (PageUptodate(page)) {
855                    if (!buffer_uptodate(bh))
856                        set_buffer_uptodate(bh);
857                } else if (!buffer_uptodate(bh)) {
858                    zero_user(page, bh_offset(bh),
859                        blocksize);
860                    set_buffer_uptodate(bh);
861                }
862                continue;
863            }
864        }
865        /*
866         * Out of bounds buffer is invalid if it was not really out of
867         * bounds.
868         */
869        BUG_ON(lcn != LCN_HOLE);
870        /*
871         * We need the runlist locked for writing, so if it is locked
872         * for reading relock it now and retry in case it changed
873         * whilst we dropped the lock.
874         */
875        BUG_ON(!rl);
876        if (!rl_write_locked) {
877            up_read(&ni->runlist.lock);
878            down_write(&ni->runlist.lock);
879            rl_write_locked = true;
880            goto retry_remap;
881        }
882        /* Find the previous last allocated cluster. */
883        BUG_ON(rl->lcn != LCN_HOLE);
884        lcn = -1;
885        rl2 = rl;
886        while (--rl2 >= ni->runlist.rl) {
887            if (rl2->lcn >= 0) {
888                lcn = rl2->lcn + rl2->length;
889                break;
890            }
891        }
892        rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
893                false);
894        if (IS_ERR(rl2)) {
895            err = PTR_ERR(rl2);
896            ntfs_debug("Failed to allocate cluster, error code %i.",
897                    err);
898            break;
899        }
900        lcn = rl2->lcn;
901        rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
902        if (IS_ERR(rl)) {
903            err = PTR_ERR(rl);
904            if (err != -ENOMEM)
905                err = -EIO;
906            if (ntfs_cluster_free_from_rl(vol, rl2)) {
907                ntfs_error(vol->sb, "Failed to release "
908                        "allocated cluster in error "
909                        "code path. Run chkdsk to "
910                        "recover the lost cluster.");
911                NVolSetErrors(vol);
912            }
913            ntfs_free(rl2);
914            break;
915        }
916        ni->runlist.rl = rl;
917        status.runlist_merged = 1;
918        ntfs_debug("Allocated cluster, lcn 0x%llx.",
919                (unsigned long long)lcn);
920        /* Map and lock the mft record and get the attribute record. */
921        if (!NInoAttr(ni))
922            base_ni = ni;
923        else
924            base_ni = ni->ext.base_ntfs_ino;
925        m = map_mft_record(base_ni);
926        if (IS_ERR(m)) {
927            err = PTR_ERR(m);
928            break;
929        }
930        ctx = ntfs_attr_get_search_ctx(base_ni, m);
931        if (unlikely(!ctx)) {
932            err = -ENOMEM;
933            unmap_mft_record(base_ni);
934            break;
935        }
936        status.mft_attr_mapped = 1;
937        err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
938                CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
939        if (unlikely(err)) {
940            if (err == -ENOENT)
941                err = -EIO;
942            break;
943        }
944        m = ctx->mrec;
945        a = ctx->attr;
946        /*
947         * Find the runlist element with which the attribute extent
948         * starts. Note, we cannot use the _attr_ version because we
949         * have mapped the mft record. That is ok because we know the
950         * runlist fragment must be mapped already to have ever gotten
951         * here, so we can just use the _rl_ version.
952         */
953        vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
954        rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
955        BUG_ON(!rl2);
956        BUG_ON(!rl2->length);
957        BUG_ON(rl2->lcn < LCN_HOLE);
958        highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
959        /*
960         * If @highest_vcn is zero, calculate the real highest_vcn
961         * (which can really be zero).
962         */
963        if (!highest_vcn)
964            highest_vcn = (sle64_to_cpu(
965                    a->data.non_resident.allocated_size) >>
966                    vol->cluster_size_bits) - 1;
967        /*
968         * Determine the size of the mapping pairs array for the new
969         * extent, i.e. the old extent with the hole filled.
970         */
971        mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
972                highest_vcn);
973        if (unlikely(mp_size <= 0)) {
974            if (!(err = mp_size))
975                err = -EIO;
976            ntfs_debug("Failed to get size for mapping pairs "
977                    "array, error code %i.", err);
978            break;
979        }
980        /*
981         * Resize the attribute record to fit the new mapping pairs
982         * array.
983         */
984        attr_rec_len = le32_to_cpu(a->length);
985        err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
986                a->data.non_resident.mapping_pairs_offset));
987        if (unlikely(err)) {
988            BUG_ON(err != -ENOSPC);
989            // TODO: Deal with this by using the current attribute
990            // and fill it with as much of the mapping pairs
991            // array as possible. Then loop over each attribute
992            // extent rewriting the mapping pairs arrays as we go
993            // along and if when we reach the end we have not
994            // enough space, try to resize the last attribute
995            // extent and if even that fails, add a new attribute
996            // extent.
997            // We could also try to resize at each step in the hope
998            // that we will not need to rewrite every single extent.
999            // Note, we may need to decompress some extents to fill
1000            // the runlist as we are walking the extents...
1001            ntfs_error(vol->sb, "Not enough space in the mft "
1002                    "record for the extended attribute "
1003                    "record. This case is not "
1004                    "implemented yet.");
1005            err = -EOPNOTSUPP;
1006            break ;
1007        }
1008        status.mp_rebuilt = 1;
1009        /*
1010         * Generate the mapping pairs array directly into the attribute
1011         * record.
1012         */
1013        err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
1014                a->data.non_resident.mapping_pairs_offset),
1015                mp_size, rl2, vcn, highest_vcn, NULL);
1016        if (unlikely(err)) {
1017            ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
1018                    "attribute type 0x%x, because building "
1019                    "the mapping pairs failed with error "
1020                    "code %i.", vi->i_ino,
1021                    (unsigned)le32_to_cpu(ni->type), err);
1022            err = -EIO;
1023            break;
1024        }
1025        /* Update the highest_vcn but only if it was not set. */
1026        if (unlikely(!a->data.non_resident.highest_vcn))
1027            a->data.non_resident.highest_vcn =
1028                    cpu_to_sle64(highest_vcn);
1029        /*
1030         * If the attribute is sparse/compressed, update the compressed
1031         * size in the ntfs_inode structure and the attribute record.
1032         */
1033        if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
1034            /*
1035             * If we are not in the first attribute extent, switch
1036             * to it, but first ensure the changes will make it to
1037             * disk later.
1038             */
1039            if (a->data.non_resident.lowest_vcn) {
1040                flush_dcache_mft_record_page(ctx->ntfs_ino);
1041                mark_mft_record_dirty(ctx->ntfs_ino);
1042                ntfs_attr_reinit_search_ctx(ctx);
1043                err = ntfs_attr_lookup(ni->type, ni->name,
1044                        ni->name_len, CASE_SENSITIVE,
1045                        0, NULL, 0, ctx);
1046                if (unlikely(err)) {
1047                    status.attr_switched = 1;
1048                    break;
1049                }
1050                /* @m is not used any more so do not set it. */
1051                a = ctx->attr;
1052            }
1053            write_lock_irqsave(&ni->size_lock, flags);
1054            ni->itype.compressed.size += vol->cluster_size;
1055            a->data.non_resident.compressed_size =
1056                    cpu_to_sle64(ni->itype.compressed.size);
1057            write_unlock_irqrestore(&ni->size_lock, flags);
1058        }
1059        /* Ensure the changes make it to disk. */
1060        flush_dcache_mft_record_page(ctx->ntfs_ino);
1061        mark_mft_record_dirty(ctx->ntfs_ino);
1062        ntfs_attr_put_search_ctx(ctx);
1063        unmap_mft_record(base_ni);
1064        /* Successfully filled the hole. */
1065        status.runlist_merged = 0;
1066        status.mft_attr_mapped = 0;
1067        status.mp_rebuilt = 0;
1068        /* Setup the map cache and use that to deal with the buffer. */
1069        was_hole = true;
1070        vcn = bh_cpos;
1071        vcn_len = 1;
1072        lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
1073        cdelta = 0;
1074        /*
1075         * If the number of remaining clusters in the @pages is smaller
1076         * or equal to the number of cached clusters, unlock the
1077         * runlist as the map cache will be used from now on.
1078         */
1079        if (likely(vcn + vcn_len >= cend)) {
1080            up_write(&ni->runlist.lock);
1081            rl_write_locked = false;
1082            rl = NULL;
1083        }
1084        goto map_buffer_cached;
1085    } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1086    /* If there are no errors, do the next page. */
1087    if (likely(!err && ++u < nr_pages))
1088        goto do_next_page;
1089    /* If there are no errors, release the runlist lock if we took it. */
1090    if (likely(!err)) {
1091        if (unlikely(rl_write_locked)) {
1092            up_write(&ni->runlist.lock);
1093            rl_write_locked = false;
1094        } else if (unlikely(rl))
1095            up_read(&ni->runlist.lock);
1096        rl = NULL;
1097    }
1098    /* If we issued read requests, let them complete. */
1099    read_lock_irqsave(&ni->size_lock, flags);
1100    initialized_size = ni->initialized_size;
1101    read_unlock_irqrestore(&ni->size_lock, flags);
1102    while (wait_bh > wait) {
1103        bh = *--wait_bh;
1104        wait_on_buffer(bh);
1105        if (likely(buffer_uptodate(bh))) {
1106            page = bh->b_page;
1107            bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
1108                    bh_offset(bh);
1109            /*
1110             * If the buffer overflows the initialized size, need
1111             * to zero the overflowing region.
1112             */
1113            if (unlikely(bh_pos + blocksize > initialized_size)) {
1114                int ofs = 0;
1115
1116                if (likely(bh_pos < initialized_size))
1117                    ofs = initialized_size - bh_pos;
1118                zero_user_segment(page, bh_offset(bh) + ofs,
1119                        blocksize);
1120            }
1121        } else /* if (unlikely(!buffer_uptodate(bh))) */
1122            err = -EIO;
1123    }
1124    if (likely(!err)) {
1125        /* Clear buffer_new on all buffers. */
1126        u = 0;
1127        do {
1128            bh = head = page_buffers(pages[u]);
1129            do {
1130                if (buffer_new(bh))
1131                    clear_buffer_new(bh);
1132            } while ((bh = bh->b_this_page) != head);
1133        } while (++u < nr_pages);
1134        ntfs_debug("Done.");
1135        return err;
1136    }
1137    if (status.attr_switched) {
1138        /* Get back to the attribute extent we modified. */
1139        ntfs_attr_reinit_search_ctx(ctx);
1140        if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1141                CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
1142            ntfs_error(vol->sb, "Failed to find required "
1143                    "attribute extent of attribute in "
1144                    "error code path. Run chkdsk to "
1145                    "recover.");
1146            write_lock_irqsave(&ni->size_lock, flags);
1147            ni->itype.compressed.size += vol->cluster_size;
1148            write_unlock_irqrestore(&ni->size_lock, flags);
1149            flush_dcache_mft_record_page(ctx->ntfs_ino);
1150            mark_mft_record_dirty(ctx->ntfs_ino);
1151            /*
1152             * The only thing that is now wrong is the compressed
1153             * size of the base attribute extent which chkdsk
1154             * should be able to fix.
1155             */
1156            NVolSetErrors(vol);
1157        } else {
1158            m = ctx->mrec;
1159            a = ctx->attr;
1160            status.attr_switched = 0;
1161        }
1162    }
1163    /*
1164     * If the runlist has been modified, need to restore it by punching a
1165     * hole into it and we then need to deallocate the on-disk cluster as
1166     * well. Note, we only modify the runlist if we are able to generate a
1167     * new mapping pairs array, i.e. only when the mapped attribute extent
1168     * is not switched.
1169     */
1170    if (status.runlist_merged && !status.attr_switched) {
1171        BUG_ON(!rl_write_locked);
1172        /* Make the file cluster we allocated sparse in the runlist. */
1173        if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
1174            ntfs_error(vol->sb, "Failed to punch hole into "
1175                    "attribute runlist in error code "
1176                    "path. Run chkdsk to recover the "
1177                    "lost cluster.");
1178            NVolSetErrors(vol);
1179        } else /* if (success) */ {
1180            status.runlist_merged = 0;
1181            /*
1182             * Deallocate the on-disk cluster we allocated but only
1183             * if we succeeded in punching its vcn out of the
1184             * runlist.
1185             */
1186            down_write(&vol->lcnbmp_lock);
1187            if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
1188                ntfs_error(vol->sb, "Failed to release "
1189                        "allocated cluster in error "
1190                        "code path. Run chkdsk to "
1191                        "recover the lost cluster.");
1192                NVolSetErrors(vol);
1193            }
1194            up_write(&vol->lcnbmp_lock);
1195        }
1196    }
1197    /*
1198     * Resize the attribute record to its old size and rebuild the mapping
1199     * pairs array. Note, we only can do this if the runlist has been
1200     * restored to its old state which also implies that the mapped
1201     * attribute extent is not switched.
1202     */
1203    if (status.mp_rebuilt && !status.runlist_merged) {
1204        if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
1205            ntfs_error(vol->sb, "Failed to restore attribute "
1206                    "record in error code path. Run "
1207                    "chkdsk to recover.");
1208            NVolSetErrors(vol);
1209        } else /* if (success) */ {
1210            if (ntfs_mapping_pairs_build(vol, (u8*)a +
1211                    le16_to_cpu(a->data.non_resident.
1212                    mapping_pairs_offset), attr_rec_len -
1213                    le16_to_cpu(a->data.non_resident.
1214                    mapping_pairs_offset), ni->runlist.rl,
1215                    vcn, highest_vcn, NULL)) {
1216                ntfs_error(vol->sb, "Failed to restore "
1217                        "mapping pairs array in error "
1218                        "code path. Run chkdsk to "
1219                        "recover.");
1220                NVolSetErrors(vol);
1221            }
1222            flush_dcache_mft_record_page(ctx->ntfs_ino);
1223            mark_mft_record_dirty(ctx->ntfs_ino);
1224        }
1225    }
1226    /* Release the mft record and the attribute. */
1227    if (status.mft_attr_mapped) {
1228        ntfs_attr_put_search_ctx(ctx);
1229        unmap_mft_record(base_ni);
1230    }
1231    /* Release the runlist lock. */
1232    if (rl_write_locked)
1233        up_write(&ni->runlist.lock);
1234    else if (rl)
1235        up_read(&ni->runlist.lock);
1236    /*
1237     * Zero out any newly allocated blocks to avoid exposing stale data.
1238     * If BH_New is set, we know that the block was newly allocated above
1239     * and that it has not been fully zeroed and marked dirty yet.
1240     */
1241    nr_pages = u;
1242    u = 0;
1243    end = bh_cpos << vol->cluster_size_bits;
1244    do {
1245        page = pages[u];
1246        bh = head = page_buffers(page);
1247        do {
1248            if (u == nr_pages &&
1249                    ((s64)page->index << PAGE_CACHE_SHIFT) +
1250                    bh_offset(bh) >= end)
1251                break;
1252            if (!buffer_new(bh))
1253                continue;
1254            clear_buffer_new(bh);
1255            if (!buffer_uptodate(bh)) {
1256                if (PageUptodate(page))
1257                    set_buffer_uptodate(bh);
1258                else {
1259                    zero_user(page, bh_offset(bh),
1260                            blocksize);
1261                    set_buffer_uptodate(bh);
1262                }
1263            }
1264            mark_buffer_dirty(bh);
1265        } while ((bh = bh->b_this_page) != head);
1266    } while (++u <= nr_pages);
1267    ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
1268    return err;
1269}
1270
1271/*
1272 * Copy as much as we can into the pages and return the number of bytes which
1273 * were successfully copied. If a fault is encountered then clear the pages
1274 * out to (ofs + bytes) and return the number of bytes which were copied.
1275 */
1276static inline size_t ntfs_copy_from_user(struct page **pages,
1277        unsigned nr_pages, unsigned ofs, const char __user *buf,
1278        size_t bytes)
1279{
1280    struct page **last_page = pages + nr_pages;
1281    char *addr;
1282    size_t total = 0;
1283    unsigned len;
1284    int left;
1285
1286    do {
1287        len = PAGE_CACHE_SIZE - ofs;
1288        if (len > bytes)
1289            len = bytes;
1290        addr = kmap_atomic(*pages, KM_USER0);
1291        left = __copy_from_user_inatomic(addr + ofs, buf, len);
1292        kunmap_atomic(addr, KM_USER0);
1293        if (unlikely(left)) {
1294            /* Do it the slow way. */
1295            addr = kmap(*pages);
1296            left = __copy_from_user(addr + ofs, buf, len);
1297            kunmap(*pages);
1298            if (unlikely(left))
1299                goto err_out;
1300        }
1301        total += len;
1302        bytes -= len;
1303        if (!bytes)
1304            break;
1305        buf += len;
1306        ofs = 0;
1307    } while (++pages < last_page);
1308out:
1309    return total;
1310err_out:
1311    total += len - left;
1312    /* Zero the rest of the target like __copy_from_user(). */
1313    while (++pages < last_page) {
1314        bytes -= len;
1315        if (!bytes)
1316            break;
1317        len = PAGE_CACHE_SIZE;
1318        if (len > bytes)
1319            len = bytes;
1320        zero_user(*pages, 0, len);
1321    }
1322    goto out;
1323}
1324
1325static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
1326        const struct iovec *iov, size_t iov_ofs, size_t bytes)
1327{
1328    size_t total = 0;
1329
1330    while (1) {
1331        const char __user *buf = iov->iov_base + iov_ofs;
1332        unsigned len;
1333        size_t left;
1334
1335        len = iov->iov_len - iov_ofs;
1336        if (len > bytes)
1337            len = bytes;
1338        left = __copy_from_user_inatomic(vaddr, buf, len);
1339        total += len;
1340        bytes -= len;
1341        vaddr += len;
1342        if (unlikely(left)) {
1343            total -= left;
1344            break;
1345        }
1346        if (!bytes)
1347            break;
1348        iov++;
1349        iov_ofs = 0;
1350    }
1351    return total;
1352}
1353
/*
 * Advance the iovec cursor (*@iovp, *@iov_ofsp) by @bytes bytes.
 *
 * @iovp:     in/out pointer to the current iovec segment
 * @iov_ofsp: in/out byte offset into the current segment
 * @bytes:    number of bytes to step over
 *
 * When a segment is consumed exactly, the cursor moves on to offset zero of
 * the following segment.  The iovec array is not bounds checked, so the
 * caller has to make sure at least @bytes bytes remain from the cursor on.
 */
static inline void ntfs_set_next_iovec(const struct iovec **iovp,
        size_t *iov_ofsp, size_t bytes)
{
    const struct iovec *iov = *iovp;
    size_t iov_ofs = *iov_ofsp;

    while (bytes) {
        /*
         * Keep the step in a size_t.  A narrower type would truncate
         * the remainder of the segment where size_t is wider than
         * unsigned, and a truncated value of zero would keep this loop
         * spinning without making any progress.
         */
        size_t len = iov->iov_len - iov_ofs;

        if (len > bytes)
            len = bytes;
        bytes -= len;
        iov_ofs += len;
        if (iov->iov_len == iov_ofs) {
            /* Segment fully consumed, move on to the next one. */
            iov++;
            iov_ofs = 0;
        }
    }
    *iovp = iov;
    *iov_ofsp = iov_ofs;
}
1376
1377/*
1378 * This has the same side-effects and return value as ntfs_copy_from_user().
1379 * The difference is that on a fault we need to memset the remainder of the
1380 * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
1381 * single-segment behaviour.
1382 *
1383 * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
1384 * atomic and when not atomic. This is ok because it calls
1385 * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
1386 * fact, the only difference between __copy_from_user_inatomic() and
1387 * __copy_from_user() is that the latter calls might_sleep() and the former
1388 * should not zero the tail of the buffer on error. And on many architectures
1389 * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
1390 * makes no difference at all on those architectures.
1391 */
1392static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
1393        unsigned nr_pages, unsigned ofs, const struct iovec **iov,
1394        size_t *iov_ofs, size_t bytes)
1395{
1396    struct page **last_page = pages + nr_pages;
1397    char *addr;
1398    size_t copied, len, total = 0;
1399
1400    do {
1401        len = PAGE_CACHE_SIZE - ofs;
1402        if (len > bytes)
1403            len = bytes;
1404        addr = kmap_atomic(*pages, KM_USER0);
1405        copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
1406                *iov, *iov_ofs, len);
1407        kunmap_atomic(addr, KM_USER0);
1408        if (unlikely(copied != len)) {
1409            /* Do it the slow way. */
1410            addr = kmap(*pages);
1411            copied = __ntfs_copy_from_user_iovec_inatomic(addr +
1412                    ofs, *iov, *iov_ofs, len);
1413            if (unlikely(copied != len))
1414                goto err_out;
1415            kunmap(*pages);
1416        }
1417        total += len;
1418        ntfs_set_next_iovec(iov, iov_ofs, len);
1419        bytes -= len;
1420        if (!bytes)
1421            break;
1422        ofs = 0;
1423    } while (++pages < last_page);
1424out:
1425    return total;
1426err_out:
1427    BUG_ON(copied > len);
1428    /* Zero the rest of the target like __copy_from_user(). */
1429    memset(addr + ofs + copied, 0, len - copied);
1430    kunmap(*pages);
1431    total += copied;
1432    ntfs_set_next_iovec(iov, iov_ofs, copied);
1433    while (++pages < last_page) {
1434        bytes -= len;
1435        if (!bytes)
1436            break;
1437        len = PAGE_CACHE_SIZE;
1438        if (len > bytes)
1439            len = bytes;
1440        zero_user(*pages, 0, len);
1441    }
1442    goto out;
1443}
1444
/*
 * Call flush_dcache_page() on each of the @nr_pages pages in @pages, going
 * from the last page to the first.  @nr_pages must not be zero.
 */
static inline void ntfs_flush_dcache_pages(struct page **pages,
        unsigned nr_pages)
{
    struct page **cur = pages + nr_pages;

    BUG_ON(!nr_pages);
    /*
     * Warning: Step the cursor in a statement of its own and never inside
     * the argument of flush_dcache_page().  That is a NULL macro on i386,
     * so a side effect in its argument would never be evaluated and the
     * loop would never terminate.
     */
    do {
        cur--;
        flush_dcache_page(*cur);
    } while (cur != pages);
}
1459
1460/**
1461 * ntfs_commit_pages_after_non_resident_write - commit the received data
1462 * @pages: array of destination pages
1463 * @nr_pages: number of pages in @pages
1464 * @pos: byte position in file at which the write begins
1465 * @bytes: number of bytes to be written
1466 *
1467 * See description of ntfs_commit_pages_after_write(), below.
1468 */
1469static inline int ntfs_commit_pages_after_non_resident_write(
1470        struct page **pages, const unsigned nr_pages,
1471        s64 pos, size_t bytes)
1472{
1473    s64 end, initialized_size;
1474    struct inode *vi;
1475    ntfs_inode *ni, *base_ni;
1476    struct buffer_head *bh, *head;
1477    ntfs_attr_search_ctx *ctx;
1478    MFT_RECORD *m;
1479    ATTR_RECORD *a;
1480    unsigned long flags;
1481    unsigned blocksize, u;
1482    int err;
1483
1484    vi = pages[0]->mapping->host;
1485    ni = NTFS_I(vi);
1486    blocksize = vi->i_sb->s_blocksize;
1487    end = pos + bytes;
1488    u = 0;
1489    do {
1490        s64 bh_pos;
1491        struct page *page;
1492        bool partial;
1493
1494        page = pages[u];
1495        bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
1496        bh = head = page_buffers(page);
1497        partial = false;
1498        do {
1499            s64 bh_end;
1500
1501            bh_end = bh_pos + blocksize;
1502            if (bh_end <= pos || bh_pos >= end) {
1503                if (!buffer_uptodate(bh))
1504                    partial = true;
1505            } else {
1506                set_buffer_uptodate(bh);
1507                mark_buffer_dirty(bh);
1508            }
1509        } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
1510        /*
1511         * If all buffers are now uptodate but the page is not, set the
1512         * page uptodate.
1513         */
1514        if (!partial && !PageUptodate(page))
1515            SetPageUptodate(page);
1516    } while (++u < nr_pages);
1517    /*
1518     * Finally, if we do not need to update initialized_size or i_size we
1519     * are finished.
1520     */
1521    read_lock_irqsave(&ni->size_lock, flags);
1522    initialized_size = ni->initialized_size;
1523    read_unlock_irqrestore(&ni->size_lock, flags);
1524    if (end <= initialized_size) {
1525        ntfs_debug("Done.");
1526        return 0;
1527    }
1528    /*
1529     * Update initialized_size/i_size as appropriate, both in the inode and
1530     * the mft record.
1531     */
1532    if (!NInoAttr(ni))
1533        base_ni = ni;
1534    else
1535        base_ni = ni->ext.base_ntfs_ino;
1536    /* Map, pin, and lock the mft record. */
1537    m = map_mft_record(base_ni);
1538    if (IS_ERR(m)) {
1539        err = PTR_ERR(m);
1540        m = NULL;
1541        ctx = NULL;
1542        goto err_out;
1543    }
1544    BUG_ON(!NInoNonResident(ni));
1545    ctx = ntfs_attr_get_search_ctx(base_ni, m);
1546    if (unlikely(!ctx)) {
1547        err = -ENOMEM;
1548        goto err_out;
1549    }
1550    err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1551            CASE_SENSITIVE, 0, NULL, 0, ctx);
1552    if (unlikely(err)) {
1553        if (err == -ENOENT)
1554            err = -EIO;
1555        goto err_out;
1556    }
1557    a = ctx->attr;
1558    BUG_ON(!a->non_resident);
1559    write_lock_irqsave(&ni->size_lock, flags);
1560    BUG_ON(end > ni->allocated_size);
1561    ni->initialized_size = end;
1562    a->data.non_resident.initialized_size = cpu_to_sle64(end);
1563    if (end > i_size_read(vi)) {
1564        i_size_write(vi, end);
1565        a->data.non_resident.data_size =
1566                a->data.non_resident.initialized_size;
1567    }
1568    write_unlock_irqrestore(&ni->size_lock, flags);
1569    /* Mark the mft record dirty, so it gets written back. */
1570    flush_dcache_mft_record_page(ctx->ntfs_ino);
1571    mark_mft_record_dirty(ctx->ntfs_ino);
1572    ntfs_attr_put_search_ctx(ctx);
1573    unmap_mft_record(base_ni);
1574    ntfs_debug("Done.");
1575    return 0;
1576err_out:
1577    if (ctx)
1578        ntfs_attr_put_search_ctx(ctx);
1579    if (m)
1580        unmap_mft_record(base_ni);
1581    ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
1582            "code %i).", err);
1583    if (err != -ENOMEM)
1584        NVolSetErrors(ni->vol);
1585    return err;
1586}
1587
1588/**
1589 * ntfs_commit_pages_after_write - commit the received data
1590 * @pages: array of destination pages
1591 * @nr_pages: number of pages in @pages
1592 * @pos: byte position in file at which the write begins
1593 * @bytes: number of bytes to be written
1594 *
1595 * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
1596 * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
1597 * locked but not kmap()ped. The source data has already been copied into the
1598 * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
1599 * the data was copied (for non-resident attributes only) and it returned
1600 * success.
1601 *
1602 * Need to set uptodate and mark dirty all buffers within the boundary of the
1603 * write. If all buffers in a page are uptodate we set the page uptodate, too.
1604 *
1605 * Setting the buffers dirty ensures that they get written out later when
1606 * ntfs_writepage() is invoked by the VM.
1607 *
1608 * Finally, we need to update i_size and initialized_size as appropriate both
1609 * in the inode and the mft record.
1610 *
1611 * This is modelled after fs/buffer.c::generic_commit_write(), which marks
1612 * buffers uptodate and dirty, sets the page uptodate if all buffers in the
1613 * page are uptodate, and updates i_size if the end of io is beyond i_size. In
1614 * that case, it also marks the inode dirty.
1615 *
1616 * If things have gone as outlined in
1617 * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
1618 * content modifications here for non-resident attributes. For resident
1619 * attributes we need to do the uptodate bringing here which we combine with
1620 * the copying into the mft record which means we save one atomic kmap.
1621 *
1622 * Return 0 on success or -errno on error.
1623 */
1624static int ntfs_commit_pages_after_write(struct page **pages,
1625        const unsigned nr_pages, s64 pos, size_t bytes)
1626{
1627    s64 end, initialized_size;
1628    loff_t i_size;
1629    struct inode *vi;
1630    ntfs_inode *ni, *base_ni;
1631    struct page *page;
1632    ntfs_attr_search_ctx *ctx;
1633    MFT_RECORD *m;
1634    ATTR_RECORD *a;
1635    char *kattr, *kaddr;
1636    unsigned long flags;
1637    u32 attr_len;
1638    int err;
1639
1640    BUG_ON(!nr_pages);
1641    BUG_ON(!pages);
1642    page = pages[0];
1643    BUG_ON(!page);
1644    vi = page->mapping->host;
1645    ni = NTFS_I(vi);
1646    ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
1647            "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
1648            vi->i_ino, ni->type, page->index, nr_pages,
1649            (long long)pos, bytes);
1650    if (NInoNonResident(ni))
1651        return ntfs_commit_pages_after_non_resident_write(pages,
1652                nr_pages, pos, bytes);
1653    BUG_ON(nr_pages > 1);
1654    /*
1655     * Attribute is resident, implying it is not compressed, encrypted, or
1656     * sparse.
1657     */
1658    if (!NInoAttr(ni))
1659        base_ni = ni;
1660    else
1661        base_ni = ni->ext.base_ntfs_ino;
1662    BUG_ON(NInoNonResident(ni));
1663    /* Map, pin, and lock the mft record. */
1664    m = map_mft_record(base_ni);
1665    if (IS_ERR(m)) {
1666        err = PTR_ERR(m);
1667        m = NULL;
1668        ctx = NULL;
1669        goto err_out;
1670    }
1671    ctx = ntfs_attr_get_search_ctx(base_ni, m);
1672    if (unlikely(!ctx)) {
1673        err = -ENOMEM;
1674        goto err_out;
1675    }
1676    err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
1677            CASE_SENSITIVE, 0, NULL, 0, ctx);
1678    if (unlikely(err)) {
1679        if (err == -ENOENT)
1680            err = -EIO;
1681        goto err_out;
1682    }
1683    a = ctx->attr;
1684    BUG_ON(a->non_resident);
1685    /* The total length of the attribute value. */
1686    attr_len = le32_to_cpu(a->data.resident.value_length);
1687    i_size = i_size_read(vi);
1688    BUG_ON(attr_len != i_size);
1689    BUG_ON(pos > attr_len);
1690    end = pos + bytes;
1691    BUG_ON(end > le32_to_cpu(a->length) -
1692            le16_to_cpu(a->data.resident.value_offset));
1693    kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
1694    kaddr = kmap_atomic(page, KM_USER0);
1695    /* Copy the received data from the page to the mft record. */
1696    memcpy(kattr + pos, kaddr + pos, bytes);
1697    /* Update the attribute length if necessary. */
1698    if (end > attr_len) {
1699        attr_len = end;
1700        a->data.resident.value_length = cpu_to_le32(attr_len);
1701    }
1702    /*
1703     * If the page is not uptodate, bring the out of bounds area(s)
1704     * uptodate by copying data from the mft record to the page.
1705     */
1706    if (!PageUptodate(page)) {
1707        if (pos > 0)
1708            memcpy(kaddr, kattr, pos);
1709        if (end < attr_len)
1710            memcpy(kaddr + end, kattr + end, attr_len - end);
1711        /* Zero the region outside the end of the attribute value. */
1712        memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
1713        flush_dcache_page(page);
1714        SetPageUptodate(page);
1715    }
1716    kunmap_atomic(kaddr, KM_USER0);
1717    /* Update initialized_size/i_size if necessary. */
1718    read_lock_irqsave(&ni->size_lock, flags);
1719    initialized_size = ni->initialized_size;
1720    BUG_ON(end > ni->allocated_size);
1721    read_unlock_irqrestore(&ni->size_lock, flags);
1722    BUG_ON(initialized_size != i_size);
1723    if (end > initialized_size) {
1724        write_lock_irqsave(&ni->size_lock, flags);
1725        ni->initialized_size = end;
1726        i_size_write(vi, end);
1727        write_unlock_irqrestore(&ni->size_lock, flags);
1728    }
1729    /* Mark the mft record dirty, so it gets written back. */
1730    flush_dcache_mft_record_page(ctx->ntfs_ino);
1731    mark_mft_record_dirty(ctx->ntfs_ino);
1732    ntfs_attr_put_search_ctx(ctx);
1733    unmap_mft_record(base_ni);
1734    ntfs_debug("Done.");
1735    return 0;
1736err_out:
1737    if (err == -ENOMEM) {
1738        ntfs_warning(vi->i_sb, "Error allocating memory required to "
1739                "commit the write.");
1740        if (PageUptodate(page)) {
1741            ntfs_warning(vi->i_sb, "Page is uptodate, setting "
1742                    "dirty so the write will be retried "
1743                    "later on by the VM.");
1744            /*
1745             * Put the page on mapping->dirty_pages, but leave its
1746             * buffers' dirty state as-is.
1747             */
1748            __set_page_dirty_nobuffers(page);
1749            err = 0;
1750        } else
1751            ntfs_error(vi->i_sb, "Page is not uptodate. Written "
1752                    "data has been lost.");
1753    } else {
1754        ntfs_error(vi->i_sb, "Resident attribute commit write failed "
1755                "with error %i.", err);
1756        NVolSetErrors(ni->vol);
1757    }
1758    if (ctx)
1759        ntfs_attr_put_search_ctx(ctx);
1760    if (m)
1761        unmap_mft_record(base_ni);
1762    return err;
1763}
1764
1765/**
1766 * ntfs_file_buffered_write -
1767 *
1768 * Locking: The vfs is holding ->i_mutex on the inode.
1769 */
1770static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1771        const struct iovec *iov, unsigned long nr_segs,
1772        loff_t pos, loff_t *ppos, size_t count)
1773{
1774    struct file *file = iocb->ki_filp;
1775    struct address_space *mapping = file->f_mapping;
1776    struct inode *vi = mapping->host;
1777    ntfs_inode *ni = NTFS_I(vi);
1778    ntfs_volume *vol = ni->vol;
1779    struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
1780    struct page *cached_page = NULL;
1781    char __user *buf = NULL;
1782    s64 end, ll;
1783    VCN last_vcn;
1784    LCN lcn;
1785    unsigned long flags;
1786    size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
1787    ssize_t status, written;
1788    unsigned nr_pages;
1789    int err;
1790
1791    ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1792            "pos 0x%llx, count 0x%lx.",
1793            vi->i_ino, (unsigned)le32_to_cpu(ni->type),
1794            (unsigned long long)pos, (unsigned long)count);
1795    if (unlikely(!count))
1796        return 0;
1797    BUG_ON(NInoMstProtected(ni));
1798    /*
1799     * If the attribute is not an index root and it is encrypted or
1800     * compressed, we cannot write to it yet. Note we need to check for
1801     * AT_INDEX_ALLOCATION since this is the type of both directory and
1802     * index inodes.
1803     */
1804    if (ni->type != AT_INDEX_ALLOCATION) {
1805        /* If file is encrypted, deny access, just like NT4. */
1806        if (NInoEncrypted(ni)) {
1807            /*
1808             * Reminder for later: Encrypted files are _always_
1809             * non-resident so that the content can always be
1810             * encrypted.
1811             */
1812            ntfs_debug("Denying write access to encrypted file.");
1813            return -EACCES;
1814        }
1815        if (NInoCompressed(ni)) {
1816            /* Only unnamed $DATA attribute can be compressed. */
1817            BUG_ON(ni->type != AT_DATA);
1818            BUG_ON(ni->name_len);
1819            /*
1820             * Reminder for later: If resident, the data is not
1821             * actually compressed. Only on the switch to non-
1822             * resident does compression kick in. This is in
1823             * contrast to encrypted files (see above).
1824             */
1825            ntfs_error(vi->i_sb, "Writing to compressed files is "
1826                    "not implemented yet. Sorry.");
1827            return -EOPNOTSUPP;
1828        }
1829    }
1830    /*
1831     * If a previous ntfs_truncate() failed, repeat it and abort if it
1832     * fails again.
1833     */
1834    if (unlikely(NInoTruncateFailed(ni))) {
1835        down_write(&vi->i_alloc_sem);
1836        err = ntfs_truncate(vi);
1837        up_write(&vi->i_alloc_sem);
1838        if (err || NInoTruncateFailed(ni)) {
1839            if (!err)
1840                err = -EIO;
1841            ntfs_error(vol->sb, "Cannot perform write to inode "
1842                    "0x%lx, attribute type 0x%x, because "
1843                    "ntfs_truncate() failed (error code "
1844                    "%i).", vi->i_ino,
1845                    (unsigned)le32_to_cpu(ni->type), err);
1846            return err;
1847        }
1848    }
1849    /* The first byte after the write. */
1850    end = pos + count;
1851    /*
1852     * If the write goes beyond the allocated size, extend the allocation
1853     * to cover the whole of the write, rounded up to the nearest cluster.
1854     */
1855    read_lock_irqsave(&ni->size_lock, flags);
1856    ll = ni->allocated_size;
1857    read_unlock_irqrestore(&ni->size_lock, flags);
1858    if (end > ll) {
1859        /* Extend the allocation without changing the data size. */
1860        ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
1861        if (likely(ll >= 0)) {
1862            BUG_ON(pos >= ll);
1863            /* If the extension was partial truncate the write. */
1864            if (end > ll) {
1865                ntfs_debug("Truncating write to inode 0x%lx, "
1866                        "attribute type 0x%x, because "
1867                        "the allocation was only "
1868                        "partially extended.",
1869                        vi->i_ino, (unsigned)
1870                        le32_to_cpu(ni->type));
1871                end = ll;
1872                count = ll - pos;
1873            }
1874        } else {
1875            err = ll;
1876            read_lock_irqsave(&ni->size_lock, flags);
1877            ll = ni->allocated_size;
1878            read_unlock_irqrestore(&ni->size_lock, flags);
1879            /* Perform a partial write if possible or fail. */
1880            if (pos < ll) {
1881                ntfs_debug("Truncating write to inode 0x%lx, "
1882                        "attribute type 0x%x, because "
1883                        "extending the allocation "
1884                        "failed (error code %i).",
1885                        vi->i_ino, (unsigned)
1886                        le32_to_cpu(ni->type), err);
1887                end = ll;
1888                count = ll - pos;
1889            } else {
1890                ntfs_error(vol->sb, "Cannot perform write to "
1891                        "inode 0x%lx, attribute type "
1892                        "0x%x, because extending the "
1893                        "allocation failed (error "
1894                        "code %i).", vi->i_ino,
1895                        (unsigned)
1896                        le32_to_cpu(ni->type), err);
1897                return err;
1898            }
1899        }
1900    }
1901    written = 0;
1902    /*
1903     * If the write starts beyond the initialized size, extend it up to the
1904     * beginning of the write and initialize all non-sparse space between
1905     * the old initialized size and the new one. This automatically also
1906     * increments the vfs inode->i_size to keep it above or equal to the
1907     * initialized_size.
1908     */
1909    read_lock_irqsave(&ni->size_lock, flags);
1910    ll = ni->initialized_size;
1911    read_unlock_irqrestore(&ni->size_lock, flags);
1912    if (pos > ll) {
1913        err = ntfs_attr_extend_initialized(ni, pos);
1914        if (err < 0) {
1915            ntfs_error(vol->sb, "Cannot perform write to inode "
1916                    "0x%lx, attribute type 0x%x, because "
1917                    "extending the initialized size "
1918                    "failed (error code %i).", vi->i_ino,
1919                    (unsigned)le32_to_cpu(ni->type), err);
1920            status = err;
1921            goto err_out;
1922        }
1923    }
1924    /*
1925     * Determine the number of pages per cluster for non-resident
1926     * attributes.
1927     */
1928    nr_pages = 1;
1929    if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
1930        nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
1931    /* Finally, perform the actual write. */
1932    last_vcn = -1;
1933    if (likely(nr_segs == 1))
1934        buf = iov->iov_base;
1935    do {
1936        VCN vcn;
1937        pgoff_t idx, start_idx;
1938        unsigned ofs, do_pages, u;
1939        size_t copied;
1940
1941        start_idx = idx = pos >> PAGE_CACHE_SHIFT;
1942        ofs = pos & ~PAGE_CACHE_MASK;
1943        bytes = PAGE_CACHE_SIZE - ofs;
1944        do_pages = 1;
1945        if (nr_pages > 1) {
1946            vcn = pos >> vol->cluster_size_bits;
1947            if (vcn != last_vcn) {
1948                last_vcn = vcn;
1949                /*
1950                 * Get the lcn of the vcn the write is in. If
1951                 * it is a hole, need to lock down all pages in
1952                 * the cluster.
1953                 */
1954                down_read(&ni->runlist.lock);
1955                lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
1956                        vol->cluster_size_bits, false);
1957                up_read(&ni->runlist.lock);
1958                if (unlikely(lcn < LCN_HOLE)) {
1959                    status = -EIO;
1960                    if (lcn == LCN_ENOMEM)
1961                        status = -ENOMEM;
1962                    else
1963                        ntfs_error(vol->sb, "Cannot "
1964                            "perform write to "
1965                            "inode 0x%lx, "
1966                            "attribute type 0x%x, "
1967                            "because the attribute "
1968                            "is corrupt.",
1969                            vi->i_ino, (unsigned)
1970                            le32_to_cpu(ni->type));
1971                    break;
1972                }
1973                if (lcn == LCN_HOLE) {
1974                    start_idx = (pos & ~(s64)
1975                            vol->cluster_size_mask)
1976                            >> PAGE_CACHE_SHIFT;
1977                    bytes = vol->cluster_size - (pos &
1978                            vol->cluster_size_mask);
1979                    do_pages = nr_pages;
1980                }
1981            }
1982        }
1983        if (bytes > count)
1984            bytes = count;
1985        /*
1986         * Bring in the user page(s) that we will copy from _first_.
1987         * Otherwise there is a nasty deadlock on copying from the same
1988         * page(s) as we are writing to, without it/them being marked
1989         * up-to-date. Note, at present there is nothing to stop the
1990         * pages being swapped out between us bringing them into memory
1991         * and doing the actual copying.
1992         */
1993        if (likely(nr_segs == 1))
1994            ntfs_fault_in_pages_readable(buf, bytes);
1995        else
1996            ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
1997        /* Get and lock @do_pages starting at index @start_idx. */
1998        status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
1999                pages, &cached_page);
2000        if (unlikely(status))
2001            break;
2002        /*
2003         * For non-resident attributes, we need to fill any holes with
2004         * actual clusters and ensure all bufferes are mapped. We also
2005         * need to bring uptodate any buffers that are only partially
2006         * being written to.
2007         */
2008        if (NInoNonResident(ni)) {
2009            status = ntfs_prepare_pages_for_non_resident_write(
2010                    pages, do_pages, pos, bytes);
2011            if (unlikely(status)) {
2012                loff_t i_size;
2013
2014                do {
2015                    unlock_page(pages[--do_pages]);
2016                    page_cache_release(pages[do_pages]);
2017                } while (do_pages);
2018                /*
2019                 * The write preparation may have instantiated
2020                 * allocated space outside i_size. Trim this
2021                 * off again. We can ignore any errors in this
2022                 * case as we will just be waisting a bit of
2023                 * allocated space, which is not a disaster.
2024                 */
2025                i_size = i_size_read(vi);
2026                if (pos + bytes > i_size)
2027                    vmtruncate(vi, i_size);
2028                break;
2029            }
2030        }
2031        u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
2032        if (likely(nr_segs == 1)) {
2033            copied = ntfs_copy_from_user(pages + u, do_pages - u,
2034                    ofs, buf, bytes);
2035            buf += copied;
2036        } else
2037            copied = ntfs_copy_from_user_iovec(pages + u,
2038                    do_pages - u, ofs, &iov, &iov_ofs,
2039                    bytes);
2040        ntfs_flush_dcache_pages(pages + u, do_pages - u);
2041        status = ntfs_commit_pages_after_write(pages, do_pages, pos,
2042                bytes);
2043        if (likely(!status)) {
2044            written += copied;
2045            count -= copied;
2046            pos += copied;
2047            if (unlikely(copied != bytes))
2048                status = -EFAULT;
2049        }
2050        do {
2051            unlock_page(pages[--do_pages]);
2052            mark_page_accessed(pages[do_pages]);
2053            page_cache_release(pages[do_pages]);
2054        } while (do_pages);
2055        if (unlikely(status))
2056            break;
2057        balance_dirty_pages_ratelimited(mapping);
2058        cond_resched();
2059    } while (count);
2060err_out:
2061    *ppos = pos;
2062    if (cached_page)
2063        page_cache_release(cached_page);
2064    ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2065            written ? "written" : "status", (unsigned long)written,
2066            (long)status);
2067    return written ? written : status;
2068}
2069
2070/**
2071 * ntfs_file_aio_write_nolock -
2072 */
2073static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
2074        const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
2075{
2076    struct file *file = iocb->ki_filp;
2077    struct address_space *mapping = file->f_mapping;
2078    struct inode *inode = mapping->host;
2079    loff_t pos;
2080    size_t count; /* after file limit checks */
2081    ssize_t written, err;
2082
2083    count = 0;
2084    err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
2085    if (err)
2086        return err;
2087    pos = *ppos;
2088    vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2089    /* We can write back this queue in page reclaim. */
2090    current->backing_dev_info = mapping->backing_dev_info;
2091    written = 0;
2092    err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2093    if (err)
2094        goto out;
2095    if (!count)
2096        goto out;
2097    err = file_remove_suid(file);
2098    if (err)
2099        goto out;
2100    file_update_time(file);
2101    written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
2102            count);
2103out:
2104    current->backing_dev_info = NULL;
2105    return written ? written : err;
2106}
2107
2108/**
2109 * ntfs_file_aio_write -
2110 */
2111static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2112        unsigned long nr_segs, loff_t pos)
2113{
2114    struct file *file = iocb->ki_filp;
2115    struct address_space *mapping = file->f_mapping;
2116    struct inode *inode = mapping->host;
2117    ssize_t ret;
2118
2119    BUG_ON(iocb->ki_pos != pos);
2120
2121    mutex_lock(&inode->i_mutex);
2122    ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2123    mutex_unlock(&inode->i_mutex);
2124    if (ret > 0) {
2125        int err = generic_write_sync(file, pos, ret);
2126        if (err < 0)
2127            ret = err;
2128    }
2129    return ret;
2130}
2131
2132/**
2133 * ntfs_file_fsync - sync a file to disk
2134 * @filp: file to be synced
2135 * @datasync: if non-zero only flush user data and not metadata
2136 *
2137 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
2138 * system calls. This function is inspired by fs/buffer.c::file_fsync().
2139 *
2140 * If @datasync is false, write the mft record and all associated extent mft
2141 * records as well as the $DATA attribute and then sync the block device.
2142 *
2143 * If @datasync is true and the attribute is non-resident, we skip the writing
2144 * of the mft record and all associated extent mft records (this might still
2145 * happen due to the write_inode_now() call).
2146 *
2147 * Also, if @datasync is true, we do not wait on the inode to be written out
2148 * but we always wait on the page cache pages to be written out.
2149 *
2150 * Locking: Caller must hold i_mutex on the inode.
2151 *
2152 * TODO: We should probably also write all attribute/index inodes associated
2153 * with this inode but since we have no simple way of getting to them we ignore
2154 * this problem for now.
2155 */
2156static int ntfs_file_fsync(struct file *filp, int datasync)
2157{
2158    struct inode *vi = filp->f_mapping->host;
2159    int err, ret = 0;
2160
2161    ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
2162    BUG_ON(S_ISDIR(vi->i_mode));
2163    if (!datasync || !NInoNonResident(NTFS_I(vi)))
2164        ret = __ntfs_write_inode(vi, 1);
2165    write_inode_now(vi, !datasync);
2166    /*
2167     * NOTE: If we were to use mapping->private_list (see ext2 and
2168     * fs/buffer.c) for dirty blocks then we could optimize the below to be
2169     * sync_mapping_buffers(vi->i_mapping).
2170     */
2171    err = sync_blockdev(vi->i_sb->s_bdev);
2172    if (unlikely(err && !ret))
2173        ret = err;
2174    if (likely(!ret))
2175        ntfs_debug("Done.");
2176    else
2177        ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
2178                "%u.", datasync ? "data" : "", vi->i_ino, -ret);
2179    return ret;
2180}
2181
2182#endif /* NTFS_RW */
2183
2184const struct file_operations ntfs_file_ops = {
2185    .llseek = generic_file_llseek, /* Seek inside file. */
2186    .read = do_sync_read, /* Read from file. */
2187    .aio_read = generic_file_aio_read, /* Async read from file. */
2188#ifdef NTFS_RW
2189    .write = do_sync_write, /* Write to file. */
2190    .aio_write = ntfs_file_aio_write, /* Async write to file. */
2191    /*.release = ,*/ /* Last file is closed. See
2192                            fs/ext2/file.c::
2193                            ext2_release_file() for
2194                            how to use this to discard
2195                            preallocated space for
2196                            write opened files. */
2197    .fsync = ntfs_file_fsync, /* Sync a file to disk. */
2198    /*.aio_fsync = ,*/ /* Sync all outstanding async
2199                            i/o operations on a
2200                            kiocb. */
2201#endif /* NTFS_RW */
2202    /*.ioctl = ,*/ /* Perform function on the
2203                            mounted filesystem. */
2204    .mmap = generic_file_mmap, /* Mmap file. */
2205    .open = ntfs_file_open, /* Open file. */
2206    .splice_read = generic_file_splice_read /* Zero-copy data send with
2207                            the data source being on
2208                            the ntfs partition. We do
2209                            not need to care about the
2210                            data destination. */
2211    /*.sendpage = ,*/ /* Zero-copy data send with
2212                            the data destination being
2213                            on the ntfs partition. We
2214                            do not need to care about
2215                            the data source. */
2216};
2217
2218const struct inode_operations ntfs_file_inode_ops = {
2219#ifdef NTFS_RW
2220    .truncate = ntfs_truncate_vfs,
2221    .setattr = ntfs_setattr,
2222#endif /* NTFS_RW */
2223};
2224
2225const struct file_operations ntfs_empty_file_ops = {};
2226
2227const struct inode_operations ntfs_empty_inode_ops = {};
2228

Archive Download this file



interactive