fs/ocfs2/file.c

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * file.c
 *
 * File open, close, extend, truncate
 *
 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
#include <linux/sched.h>
#include <linux/splice.h>
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
#include <linux/quotaops.h>

#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
#include "sysfile.h"
#include "inode.h"
#include "ioctl.h"
#include "journal.h"
#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
#include "acl.h"
#include "quota.h"
#include "refcounttree.h"

#include "buffer_head_io.h"

static int ocfs2_sync_inode(struct inode *inode)
{
    filemap_fdatawrite(inode->i_mapping);
    return sync_mapping_buffers(inode->i_mapping);
}

static int ocfs2_init_file_private(struct inode *inode, struct file *file)
{
    struct ocfs2_file_private *fp;

    fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
    if (!fp)
        return -ENOMEM;

    fp->fp_file = file;
    mutex_init(&fp->fp_mutex);
    ocfs2_file_lock_res_init(&fp->fp_flock, fp);
    file->private_data = fp;

    return 0;
}

static void ocfs2_free_file_private(struct inode *inode, struct file *file)
{
    struct ocfs2_file_private *fp = file->private_data;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

    if (fp) {
        ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
        ocfs2_lock_res_free(&fp->fp_flock);
        kfree(fp);
        file->private_data = NULL;
    }
}

static int ocfs2_file_open(struct inode *inode, struct file *file)
{
    int status;
    int mode = file->f_flags;
    struct ocfs2_inode_info *oi = OCFS2_I(inode);

    mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
           file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);

    if (file->f_mode & FMODE_WRITE)
        dquot_initialize(inode);

    spin_lock(&oi->ip_lock);

    /* Check that the inode hasn't been wiped from disk by another
     * node. If it hasn't then we're safe as long as we hold the
     * spin lock until our increment of open count. */
    if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
        spin_unlock(&oi->ip_lock);

        status = -ENOENT;
        goto leave;
    }

    if (mode & O_DIRECT)
        oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;

    oi->ip_open_count++;
    spin_unlock(&oi->ip_lock);

    status = ocfs2_init_file_private(inode, file);
    if (status) {
        /*
         * We want to set open count back if we're failing the
         * open.
         */
        spin_lock(&oi->ip_lock);
        oi->ip_open_count--;
        spin_unlock(&oi->ip_lock);
    }

leave:
    mlog_exit(status);
    return status;
}

static int ocfs2_file_release(struct inode *inode, struct file *file)
{
    struct ocfs2_inode_info *oi = OCFS2_I(inode);

    mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
               file->f_path.dentry->d_name.len,
               file->f_path.dentry->d_name.name);

    spin_lock(&oi->ip_lock);
    if (!--oi->ip_open_count)
        oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
    spin_unlock(&oi->ip_lock);

    ocfs2_free_file_private(inode, file);

    mlog_exit(0);

    return 0;
}

static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
    return ocfs2_init_file_private(inode, file);
}

static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
    ocfs2_free_file_private(inode, file);
    return 0;
}

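/*
 * Flush dirty pages and buffers for the inode and, unless this is a
 * datasync with no dirty non-data state, force a journal commit so
 * metadata reaches stable storage too. Any error is collapsed to
 * -EIO for the caller.
 */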
static int ocfs2_sync_file(struct file *file,
               struct dentry *dentry,
               int datasync)
{
    int err = 0;
    journal_t *journal;
    struct inode *inode = dentry->d_inode;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

    mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
           dentry->d_name.len, dentry->d_name.name);

    err = ocfs2_sync_inode(dentry->d_inode);
    if (err)
        goto bail;

    if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
        goto bail;

    journal = osb->journal->j_journal;
    err = jbd2_journal_force_commit(journal);

bail:
    mlog_exit(err);

    return (err < 0) ? -EIO : 0;
}

int ocfs2_should_update_atime(struct inode *inode,
                  struct vfsmount *vfsmnt)
{
    struct timespec now;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

    if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
        return 0;

    if ((inode->i_flags & S_NOATIME) ||
        ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
        return 0;

    /*
     * We can be called with no vfsmnt structure - NFSD will
     * sometimes do this.
     *
     * Note that our action here is different than touch_atime() -
     * if we can't tell whether this is a noatime mount, then we
     * don't know whether to trust the value of s_atime_quantum.
     */
    if (vfsmnt == NULL)
        return 0;

    if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
        ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
        return 0;

    if (vfsmnt->mnt_flags & MNT_RELATIME) {
        if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
            (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
            return 1;

        return 0;
    }

    now = CURRENT_TIME;
    if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
        return 0;
    else
        return 1;
}

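/*
 * Stamp the current time into both the in-core inode and the on-disk
 * dinode under a small journal transaction, so the atime update
 * survives a crash. The caller supplies the inode's dinode buffer.
 */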
int ocfs2_update_inode_atime(struct inode *inode,
                 struct buffer_head *bh)
{
    int ret;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    handle_t *handle;
    struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;

    mlog_entry_void();

    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        mlog_errno(ret);
        goto out;
    }

    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
                      OCFS2_JOURNAL_ACCESS_WRITE);
    if (ret) {
        mlog_errno(ret);
        goto out_commit;
    }

    /*
     * Don't use ocfs2_mark_inode_dirty() here as we don't always
     * have i_mutex to guard against concurrent changes to other
     * inode fields.
     */
    inode->i_atime = CURRENT_TIME;
    di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
    di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);

    ret = ocfs2_journal_dirty(handle, bh);
    if (ret < 0)
        mlog_errno(ret);

out_commit:
    ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
    mlog_exit(ret);
    return ret;
}

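/*
 * Set a new i_size within an already-running transaction: update the
 * in-core size, block count and c/mtime, then push them to the
 * dinode via ocfs2_mark_inode_dirty(). ocfs2_simple_size_update()
 * below wraps this in a one-off transaction for callers that don't
 * have one.
 */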
static int ocfs2_set_inode_size(handle_t *handle,
                struct inode *inode,
                struct buffer_head *fe_bh,
                u64 new_i_size)
{
    int status;

    mlog_entry_void();
    i_size_write(inode, new_i_size);
    inode->i_blocks = ocfs2_inode_sector_count(inode);
    inode->i_ctime = inode->i_mtime = CURRENT_TIME;

    status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
    if (status < 0) {
        mlog_errno(status);
        goto bail;
    }

bail:
    mlog_exit(status);
    return status;
}

int ocfs2_simple_size_update(struct inode *inode,
                 struct buffer_head *di_bh,
                 u64 new_i_size)
{
    int ret;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    handle_t *handle = NULL;

    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        mlog_errno(ret);
        goto out;
    }

    ret = ocfs2_set_inode_size(handle, inode, di_bh,
                   new_i_size);
    if (ret < 0)
        mlog_errno(ret);

    ocfs2_commit_trans(osb, handle);
out:
    return ret;
}

static int ocfs2_cow_file_pos(struct inode *inode,
                  struct buffer_head *fe_bh,
                  u64 offset)
{
    int status;
    u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
    unsigned int num_clusters = 0;
    unsigned int ext_flags = 0;

    /*
     * If the new offset is cluster-aligned, there is no space for
     * ocfs2_zero_range_for_truncate to fill, so there is no need
     * to CoW either.
     */
    if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
        return 0;

    status = ocfs2_get_clusters(inode, cpos, &phys,
                    &num_clusters, &ext_flags);
    if (status) {
        mlog_errno(status);
        goto out;
    }

    if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
        goto out;

    return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);

out:
    return status;
}

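/*
 * Prepare an allocation-changing truncate: CoW the partial cluster
 * at the new EOF if it is refcounted, zero the tail of that cluster,
 * and write the reduced i_size into the dinode so that recovery can
 * finish the truncate if we crash before the extents are freed.
 */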
static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
                     struct inode *inode,
                     struct buffer_head *fe_bh,
                     u64 new_i_size)
{
    int status;
    handle_t *handle;
    struct ocfs2_dinode *di;
    u64 cluster_bytes;

    mlog_entry_void();

    /*
     * We need to CoW the cluster that contains the offset if it is
     * reflinked, since we will call ocfs2_zero_range_for_truncate
     * later, which will write zeros from the offset to the end of
     * the cluster.
     */
    status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
    if (status) {
        mlog_errno(status);
        return status;
    }

    /* TODO: This needs to actually orphan the inode in this
     * transaction. */

    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        status = PTR_ERR(handle);
        mlog_errno(status);
        goto out;
    }

    status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
                     OCFS2_JOURNAL_ACCESS_WRITE);
    if (status < 0) {
        mlog_errno(status);
        goto out_commit;
    }

    /*
     * Do this before setting i_size.
     */
    cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
    status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
                           cluster_bytes);
    if (status) {
        mlog_errno(status);
        goto out_commit;
    }

    i_size_write(inode, new_i_size);
    inode->i_ctime = inode->i_mtime = CURRENT_TIME;

    di = (struct ocfs2_dinode *) fe_bh->b_data;
    di->i_size = cpu_to_le64(new_i_size);
    di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
    di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

    status = ocfs2_journal_dirty(handle, fe_bh);
    if (status < 0)
        mlog_errno(status);

out_commit:
    ocfs2_commit_trans(osb, handle);
out:

    mlog_exit(status);
    return status;
}

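/*
 * Shrink a file to new_i_size. The simple cases (no size change, or
 * inline data) are handled without touching the allocation; anything
 * else is staged through ocfs2_orphan_for_truncate() and the extent
 * tree is then cut back under ip_alloc_sem.
 */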
static int ocfs2_truncate_file(struct inode *inode,
                   struct buffer_head *di_bh,
                   u64 new_i_size)
{
    int status = 0;
    struct ocfs2_dinode *fe = NULL;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct ocfs2_truncate_context *tc = NULL;

    mlog_entry("(inode = %llu, new_i_size = %llu)\n",
           (unsigned long long)OCFS2_I(inode)->ip_blkno,
           (unsigned long long)new_i_size);

    /* We trust di_bh because it comes from ocfs2_inode_lock(), which
     * already validated it */
    fe = (struct ocfs2_dinode *) di_bh->b_data;

    mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
            "Inode %llu, inode i_size = %lld != di "
            "i_size = %llu, i_flags = 0x%x\n",
            (unsigned long long)OCFS2_I(inode)->ip_blkno,
            i_size_read(inode),
            (unsigned long long)le64_to_cpu(fe->i_size),
            le32_to_cpu(fe->i_flags));

    if (new_i_size > le64_to_cpu(fe->i_size)) {
        mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
             (unsigned long long)le64_to_cpu(fe->i_size),
             (unsigned long long)new_i_size);
        status = -EINVAL;
        mlog_errno(status);
        goto bail;
    }

    mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
         (unsigned long long)le64_to_cpu(fe->i_blkno),
         (unsigned long long)le64_to_cpu(fe->i_size),
         (unsigned long long)new_i_size);

    /* let's handle the simple truncate cases before doing any more
     * cluster locking. */
    if (new_i_size == le64_to_cpu(fe->i_size))
        goto bail;

    down_write(&OCFS2_I(inode)->ip_alloc_sem);

    /*
     * The inode lock forced other nodes to sync and drop their
     * pages, which (correctly) happens even if we have a truncate
     * without allocation change - ocfs2 cluster sizes can be much
     * greater than page size, so we have to truncate them
     * anyway.
     */
    unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
    truncate_inode_pages(inode->i_mapping, new_i_size);

    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
        status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
                           i_size_read(inode), 1);
        if (status)
            mlog_errno(status);

        goto bail_unlock_sem;
    }

    /* alright, we're going to need to do a full blown alloc size
     * change. Orphan the inode so that recovery can complete the
     * truncate if necessary. This does the task of marking
     * i_size. */
    status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
    if (status < 0) {
        mlog_errno(status);
        goto bail_unlock_sem;
    }

    status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
    if (status < 0) {
        mlog_errno(status);
        goto bail_unlock_sem;
    }

    status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
    if (status < 0) {
        mlog_errno(status);
        goto bail_unlock_sem;
    }

    /* TODO: orphan dir cleanup here. */
bail_unlock_sem:
    up_write(&OCFS2_I(inode)->ip_alloc_sem);

bail:
    if (!status && OCFS2_I(inode)->ip_clusters == 0)
        status = ocfs2_try_remove_refcount_tree(inode, di_bh);

    mlog_exit(status);
    return status;
}

/*
 * extend file allocation only here.
 * we'll update all the disk stuff, and oip->alloc_size
 *
 * expect stuff to be locked, a transaction started and enough data /
 * metadata reservations in the contexts.
 *
 * Will return -EAGAIN, and a reason if a restart is needed.
 * If passed in, *reason will always be set, even in error.
 */
int ocfs2_add_inode_data(struct ocfs2_super *osb,
             struct inode *inode,
             u32 *logical_offset,
             u32 clusters_to_add,
             int mark_unwritten,
             struct buffer_head *fe_bh,
             handle_t *handle,
             struct ocfs2_alloc_context *data_ac,
             struct ocfs2_alloc_context *meta_ac,
             enum ocfs2_alloc_restarted *reason_ret)
{
    int ret;
    struct ocfs2_extent_tree et;

    ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
    ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
                      clusters_to_add, mark_unwritten,
                      data_ac, meta_ac, reason_ret);

    return ret;
}

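/*
 * Add clusters_to_add clusters to the file, starting at logical
 * cluster logical_start. Quota is reserved up front, the allocation
 * runs in its own transaction, and the loop restarts - either the
 * whole function or just the transaction - whenever the allocator
 * signals that more reservations or journal credits are needed.
 */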
static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
                     u32 clusters_to_add, int mark_unwritten)
{
    int status = 0;
    int restart_func = 0;
    int credits;
    u32 prev_clusters;
    struct buffer_head *bh = NULL;
    struct ocfs2_dinode *fe = NULL;
    handle_t *handle = NULL;
    struct ocfs2_alloc_context *data_ac = NULL;
    struct ocfs2_alloc_context *meta_ac = NULL;
    enum ocfs2_alloc_restarted why;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct ocfs2_extent_tree et;
    int did_quota = 0;

    mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

    /*
     * This function only exists for file systems which don't
     * support holes.
     */
    BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

    status = ocfs2_read_inode_block(inode, &bh);
    if (status < 0) {
        mlog_errno(status);
        goto leave;
    }
    fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
    BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

    mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
         "clusters_to_add = %u\n",
         (unsigned long long)OCFS2_I(inode)->ip_blkno,
         (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
         clusters_to_add);
    ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
    status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
                       &data_ac, &meta_ac);
    if (status) {
        mlog_errno(status);
        goto leave;
    }

    credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
                        clusters_to_add);
    handle = ocfs2_start_trans(osb, credits);
    if (IS_ERR(handle)) {
        status = PTR_ERR(handle);
        handle = NULL;
        mlog_errno(status);
        goto leave;
    }

restarted_transaction:
    status = dquot_alloc_space_nodirty(inode,
            ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    if (status)
        goto leave;
    did_quota = 1;

    /* reserve a write to the file entry early on - that way if we
     * run out of credits in the allocation path, we can still
     * update i_size. */
    status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
                     OCFS2_JOURNAL_ACCESS_WRITE);
    if (status < 0) {
        mlog_errno(status);
        goto leave;
    }

    prev_clusters = OCFS2_I(inode)->ip_clusters;

    status = ocfs2_add_inode_data(osb,
                      inode,
                      &logical_start,
                      clusters_to_add,
                      mark_unwritten,
                      bh,
                      handle,
                      data_ac,
                      meta_ac,
                      &why);
    if ((status < 0) && (status != -EAGAIN)) {
        if (status != -ENOSPC)
            mlog_errno(status);
        goto leave;
    }

    status = ocfs2_journal_dirty(handle, bh);
    if (status < 0) {
        mlog_errno(status);
        goto leave;
    }

    spin_lock(&OCFS2_I(inode)->ip_lock);
    clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
    spin_unlock(&OCFS2_I(inode)->ip_lock);
    /* Release unused quota reservation */
    dquot_free_space(inode,
            ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    did_quota = 0;

    if (why != RESTART_NONE && clusters_to_add) {
        if (why == RESTART_META) {
            mlog(0, "restarting function.\n");
            restart_func = 1;
        } else {
            BUG_ON(why != RESTART_TRANS);

            mlog(0, "restarting transaction.\n");
            /* TODO: This can be more intelligent. */
            credits = ocfs2_calc_extend_credits(osb->sb,
                                &fe->id2.i_list,
                                clusters_to_add);
            status = ocfs2_extend_trans(handle, credits);
            if (status < 0) {
                /* handle still has to be committed at
                 * this point. */
                status = -ENOMEM;
                mlog_errno(status);
                goto leave;
            }
            goto restarted_transaction;
        }
    }

    mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
         le32_to_cpu(fe->i_clusters),
         (unsigned long long)le64_to_cpu(fe->i_size));
    mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
         OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));

leave:
    if (status < 0 && did_quota)
        dquot_free_space(inode,
            ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
    if (handle) {
        ocfs2_commit_trans(osb, handle);
        handle = NULL;
    }
    if (data_ac) {
        ocfs2_free_alloc_context(data_ac);
        data_ac = NULL;
    }
    if (meta_ac) {
        ocfs2_free_alloc_context(meta_ac);
        meta_ac = NULL;
    }
    if ((!status) && restart_func) {
        restart_func = 0;
        goto restart_all;
    }
    brelse(bh);
    bh = NULL;

    mlog_exit(status);
    return status;
}

/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode,
                 u64 size)
{
    struct address_space *mapping = inode->i_mapping;
    struct page *page;
    unsigned long index;
    unsigned int offset;
    handle_t *handle = NULL;
    int ret;

    offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
    /* ugh. in prepare/commit_write, if from==to==start of block, we
    ** skip the prepare. make sure we never send an offset for the start
    ** of a block
    */
    if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
        offset++;
    }
    index = size >> PAGE_CACHE_SHIFT;

    page = grab_cache_page(mapping, index);
    if (!page) {
        ret = -ENOMEM;
        mlog_errno(ret);
        goto out;
    }

    ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
    if (ret < 0) {
        mlog_errno(ret);
        goto out_unlock;
    }

    if (ocfs2_should_order_data(inode)) {
        handle = ocfs2_start_walk_page_trans(inode, page, offset,
                             offset);
        if (IS_ERR(handle)) {
            ret = PTR_ERR(handle);
            handle = NULL;
            goto out_unlock;
        }
    }

    /* must not update i_size! */
    ret = block_commit_write(page, offset, offset);
    if (ret < 0)
        mlog_errno(ret);
    else
        ret = 0;

    if (handle)
        ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
    unlock_page(page);
    page_cache_release(page);
out:
    return ret;
}

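/*
 * Zero the bytes between the current (block-aligned) i_size and
 * zero_to_size, one block-sized step at a time via
 * ocfs2_write_zero_page(), rescheduling between iterations so large
 * extends don't hog the cpu.
 */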
static int ocfs2_zero_extend(struct inode *inode,
                 u64 zero_to_size)
{
    int ret = 0;
    u64 start_off;
    struct super_block *sb = inode->i_sb;

    start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
    while (start_off < zero_to_size) {
        ret = ocfs2_write_zero_page(inode, start_off);
        if (ret < 0) {
            mlog_errno(ret);
            goto out;
        }

        start_off += sb->s_blocksize;

        /*
         * Very large extends have the potential to lock up
         * the cpu for extended periods of time.
         */
        cond_resched();
    }

out:
    return ret;
}

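/*
 * Extend a file that cannot have holes: allocate whatever clusters
 * are needed to back new_i_size, then zero from the old i_size up to
 * zero_to so reads never see stale data in the new region.
 */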
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
{
    int ret;
    u32 clusters_to_add;
    struct ocfs2_inode_info *oi = OCFS2_I(inode);

    clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
    if (clusters_to_add < oi->ip_clusters)
        clusters_to_add = 0;
    else
        clusters_to_add -= oi->ip_clusters;

    if (clusters_to_add) {
        ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
                        clusters_to_add, 0);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }
    }

    /*
     * Call this even if we don't add any clusters to the tree. We
     * still need to zero the area between the old i_size and the
     * new i_size.
     */
    ret = ocfs2_zero_extend(inode, zero_to);
    if (ret < 0)
        mlog_errno(ret);

out:
    return ret;
}

static int ocfs2_extend_file(struct inode *inode,
                 struct buffer_head *di_bh,
                 u64 new_i_size)
{
    int ret = 0;
    struct ocfs2_inode_info *oi = OCFS2_I(inode);

    BUG_ON(!di_bh);

    /* setattr sometimes calls us like this. */
    if (new_i_size == 0)
        goto out;

    if (i_size_read(inode) == new_i_size)
        goto out;
    BUG_ON(new_i_size < i_size_read(inode));

    /*
     * Fall through for converting inline data, even if the fs
     * supports sparse files.
     *
     * The check for inline data here is legal - nobody can add
     * the feature since we have i_mutex. We must check it again
     * after acquiring ip_alloc_sem though, as paths like mmap
     * might have raced us to converting the inode to extents.
     */
    if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
        && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
        goto out_update_size;

    /*
     * The alloc sem blocks people in read/write from reading our
     * allocation until we're done changing it. We depend on
     * i_mutex to block other extend/truncate calls while we're
     * here.
     */
    down_write(&oi->ip_alloc_sem);

    if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
        /*
         * We can optimize small extends by keeping the inode's
         * inline data.
         */
        if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
            up_write(&oi->ip_alloc_sem);
            goto out_update_size;
        }

        ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
        if (ret) {
            up_write(&oi->ip_alloc_sem);

            mlog_errno(ret);
            goto out;
        }
    }

    if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
        ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

    up_write(&oi->ip_alloc_sem);

    if (ret < 0) {
        mlog_errno(ret);
        goto out;
    }

out_update_size:
    ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
    if (ret < 0)
        mlog_errno(ret);

out:
    return ret;
}

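/*
 * ocfs2's ->setattr. Size changes are serialized against other nodes
 * with the rw and inode cluster locks and dispatched to
 * ocfs2_truncate_file()/ocfs2_extend_file(); uid/gid changes gather
 * their dquot references first so allocation and freeing of quota
 * structures never happens inside dquot_transfer() under our locks.
 */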
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
    int status = 0, size_change;
    struct inode *inode = dentry->d_inode;
    struct super_block *sb = inode->i_sb;
    struct ocfs2_super *osb = OCFS2_SB(sb);
    struct buffer_head *bh = NULL;
    handle_t *handle = NULL;
    int qtype;
    struct dquot *transfer_from[MAXQUOTAS] = { };
    struct dquot *transfer_to[MAXQUOTAS] = { };

    mlog_entry("(0x%p, '%.*s')\n", dentry,
               dentry->d_name.len, dentry->d_name.name);

    /* ensuring we don't even attempt to truncate a symlink */
    if (S_ISLNK(inode->i_mode))
        attr->ia_valid &= ~ATTR_SIZE;

    if (attr->ia_valid & ATTR_MODE)
        mlog(0, "mode change: %d\n", attr->ia_mode);
    if (attr->ia_valid & ATTR_UID)
        mlog(0, "uid change: %d\n", attr->ia_uid);
    if (attr->ia_valid & ATTR_GID)
        mlog(0, "gid change: %d\n", attr->ia_gid);
    if (attr->ia_valid & ATTR_SIZE)
        mlog(0, "size change...\n");
    if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
        mlog(0, "time change...\n");

#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
               | ATTR_GID | ATTR_UID | ATTR_MODE)
    if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
        mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
        return 0;
    }

    status = inode_change_ok(inode, attr);
    if (status)
        return status;

    size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
    if (size_change) {
        dquot_initialize(inode);

        status = ocfs2_rw_lock(inode, 1);
        if (status < 0) {
            mlog_errno(status);
            goto bail;
        }
    }

    status = ocfs2_inode_lock(inode, &bh, 1);
    if (status < 0) {
        if (status != -ENOENT)
            mlog_errno(status);
        goto bail_unlock_rw;
    }

    if (size_change && attr->ia_size != i_size_read(inode)) {
        status = inode_newsize_ok(inode, attr->ia_size);
        if (status)
            goto bail_unlock;

        if (i_size_read(inode) > attr->ia_size) {
            if (ocfs2_should_order_data(inode)) {
                status = ocfs2_begin_ordered_truncate(inode,
                                      attr->ia_size);
                if (status)
                    goto bail_unlock;
            }
            status = ocfs2_truncate_file(inode, bh, attr->ia_size);
        } else
            status = ocfs2_extend_file(inode, bh, attr->ia_size);
        if (status < 0) {
            if (status != -ENOSPC)
                mlog_errno(status);
            status = -ENOSPC;
            goto bail_unlock;
        }
    }

    if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
        (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
        /*
         * Gather pointers to quota structures so that allocation /
         * freeing of quota structures happens here and not inside
         * dquot_transfer() where we have problems with lock ordering
         */
        if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
            && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
            OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
            transfer_to[USRQUOTA] = dqget(sb, attr->ia_uid,
                              USRQUOTA);
            transfer_from[USRQUOTA] = dqget(sb, inode->i_uid,
                            USRQUOTA);
            if (!transfer_to[USRQUOTA] || !transfer_from[USRQUOTA]) {
                status = -ESRCH;
                goto bail_unlock;
            }
        }
        if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
            && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
            transfer_to[GRPQUOTA] = dqget(sb, attr->ia_gid,
                              GRPQUOTA);
            transfer_from[GRPQUOTA] = dqget(sb, inode->i_gid,
                            GRPQUOTA);
            if (!transfer_to[GRPQUOTA] || !transfer_from[GRPQUOTA]) {
                status = -ESRCH;
                goto bail_unlock;
            }
        }
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS +
                       2 * ocfs2_quota_trans_credits(sb));
        if (IS_ERR(handle)) {
            status = PTR_ERR(handle);
            mlog_errno(status);
            goto bail_unlock;
        }
        status = dquot_transfer(inode, attr);
        if (status < 0)
            goto bail_commit;
    } else {
        handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
            status = PTR_ERR(handle);
            mlog_errno(status);
            goto bail_unlock;
        }
    }

    /*
     * This will intentionally not wind up calling vmtruncate(),
     * since all the work for a size change has been done above.
     * Otherwise, we could get into problems with truncate as
     * ip_alloc_sem is used there to protect against i_size
     * changes.
     */
    status = inode_setattr(inode, attr);
    if (status < 0) {
        mlog_errno(status);
        goto bail_commit;
    }

    status = ocfs2_mark_inode_dirty(handle, inode, bh);
    if (status < 0)
        mlog_errno(status);

bail_commit:
    ocfs2_commit_trans(osb, handle);
bail_unlock:
    ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
    if (size_change)
        ocfs2_rw_unlock(inode, 1);
bail:
    brelse(bh);

    /* Release quota pointers in case we acquired them */
    for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
        dqput(transfer_to[qtype]);
        dqput(transfer_from[qtype]);
    }

    if (!status && attr->ia_valid & ATTR_MODE) {
        status = ocfs2_acl_chmod(inode);
        if (status < 0)
            mlog_errno(status);
    }

    mlog_exit(status);
    return status;
}

int ocfs2_getattr(struct vfsmount *mnt,
          struct dentry *dentry,
          struct kstat *stat)
{
    struct inode *inode = dentry->d_inode;
    struct super_block *sb = dentry->d_inode->i_sb;
    struct ocfs2_super *osb = sb->s_fs_info;
    int err;

    mlog_entry_void();

    err = ocfs2_inode_revalidate(dentry);
    if (err) {
        if (err != -ENOENT)
            mlog_errno(err);
        goto bail;
    }

    generic_fillattr(inode, stat);

    /* We set the blksize from the cluster size for performance */
    stat->blksize = osb->s_clustersize;

bail:
    mlog_exit(err);

    return err;
}

int ocfs2_permission(struct inode *inode, int mask)
{
    int ret;

    mlog_entry_void();

    ret = ocfs2_inode_lock(inode, NULL, 0);
    if (ret) {
        if (ret != -ENOENT)
            mlog_errno(ret);
        goto out;
    }

    ret = generic_permission(inode, mask, ocfs2_check_acl);

    ocfs2_inode_unlock(inode, 0);
out:
    mlog_exit(ret);
    return ret;
}

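/*
 * Clear the setuid bit, and the setgid bit when group-execute is
 * set, updating both the in-core mode and the on-disk dinode in a
 * small transaction. Callers have already checked
 * should_remove_suid().
 */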
static int __ocfs2_write_remove_suid(struct inode *inode,
                     struct buffer_head *bh)
{
    int ret;
    handle_t *handle;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct ocfs2_dinode *di;

    mlog_entry("(Inode %llu, mode 0%o)\n",
           (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);

    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        mlog_errno(ret);
        goto out;
    }

    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
                      OCFS2_JOURNAL_ACCESS_WRITE);
    if (ret < 0) {
        mlog_errno(ret);
        goto out_trans;
    }

    inode->i_mode &= ~S_ISUID;
    if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
        inode->i_mode &= ~S_ISGID;

    di = (struct ocfs2_dinode *) bh->b_data;
    di->i_mode = cpu_to_le16(inode->i_mode);

    ret = ocfs2_journal_dirty(handle, bh);
    if (ret < 0)
        mlog_errno(ret);

out_trans:
    ocfs2_commit_trans(osb, handle);
out:
    mlog_exit(ret);
    return ret;
}

/*
 * Will look for holes and unwritten extents in the range starting at
 * pos for count bytes (inclusive).
 */
static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
                       size_t count)
{
    int ret = 0;
    unsigned int extent_flags;
    u32 cpos, clusters, extent_len, phys_cpos;
    struct super_block *sb = inode->i_sb;

    cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
    clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

    while (clusters) {
        ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
                     &extent_flags);
        if (ret < 0) {
            mlog_errno(ret);
            goto out;
        }

        if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
            ret = 1;
            break;
        }

        if (extent_len > clusters)
            extent_len = clusters;

        clusters -= extent_len;
        cpos += extent_len;
    }
out:
    return ret;
}

static int ocfs2_write_remove_suid(struct inode *inode)
{
    int ret;
    struct buffer_head *bh = NULL;

    ret = ocfs2_read_inode_block(inode, &bh);
    if (ret < 0) {
        mlog_errno(ret);
        goto out;
    }

    ret = __ocfs2_write_remove_suid(inode, bh);
out:
    brelse(bh);
    return ret;
}

/*
 * Allocate enough extents to cover the region starting at byte offset
 * start for len bytes. Existing extents are skipped, any extents
 * added are marked as "unwritten".
 */
static int ocfs2_allocate_unwritten_extents(struct inode *inode,
                        u64 start, u64 len)
{
    int ret;
    u32 cpos, phys_cpos, clusters, alloc_size;
    u64 end = start + len;
    struct buffer_head *di_bh = NULL;

    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
        ret = ocfs2_read_inode_block(inode, &di_bh);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }

        /*
         * Nothing to do if the requested reservation range
         * fits within the inode.
         */
        if (ocfs2_size_fits_inline_data(di_bh, end))
            goto out;

        ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }
    }

    /*
     * We consider both start and len to be inclusive.
     */
    cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
    clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
    clusters -= cpos;

    while (clusters) {
        ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
                     &alloc_size, NULL);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }

        /*
         * Hole or existing extent len can be arbitrary, so
         * cap it to our own allocation request.
         */
        if (alloc_size > clusters)
            alloc_size = clusters;

        if (phys_cpos) {
            /*
             * We already have an allocation at this
             * region so we can safely skip it.
             */
            goto next;
        }

        ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
        if (ret) {
            if (ret != -ENOSPC)
                mlog_errno(ret);
            goto out;
        }

next:
        cpos += alloc_size;
        clusters -= alloc_size;
    }

    ret = 0;
out:

    brelse(di_bh);
    return ret;
}

/*
 * Truncate a byte range, avoiding pages within partial clusters. This
 * preserves those pages for the zeroing code to write to.
 */
static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
                     u64 byte_len)
{
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    loff_t start, end;
    struct address_space *mapping = inode->i_mapping;

    start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
    end = byte_start + byte_len;
    end = end & ~(osb->s_clustersize - 1);

    if (start < end) {
        unmap_mapping_range(mapping, start, end - start, 0);
        truncate_inode_pages_range(mapping, start, end - 1);
    }
}

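/*
 * Zero the parts of a deallocation range that only partially cover a
 * cluster: the tail of the first cluster and, when the range crosses
 * a cluster boundary, the head of the last. Fully covered clusters
 * are freed outright and need no zeroing.
 */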
static int ocfs2_zero_partial_clusters(struct inode *inode,
                       u64 start, u64 len)
{
    int ret = 0;
    u64 tmpend, end = start + len;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    unsigned int csize = osb->s_clustersize;
    handle_t *handle;

    /*
     * The "start" and "end" values are NOT necessarily part of
     * the range whose allocation is being deleted. Rather, this
     * is what the user passed in with the request. We must zero
     * partial clusters here. There's no need to worry about
     * physical allocation - the zeroing code knows to skip holes.
     */
    mlog(0, "byte start: %llu, end: %llu\n",
         (unsigned long long)start, (unsigned long long)end);

    /*
     * If both edges are on a cluster boundary then there's no
     * zeroing required as the region is part of the allocation to
     * be truncated.
     */
    if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
        goto out;

    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        mlog_errno(ret);
        goto out;
    }

    /*
     * We want to get the byte offset of the end of the 1st cluster.
     */
    tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
    if (tmpend > end)
        tmpend = end;

    mlog(0, "1st range: start: %llu, tmpend: %llu\n",
         (unsigned long long)start, (unsigned long long)tmpend);

    ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
    if (ret)
        mlog_errno(ret);

    if (tmpend < end) {
        /*
         * This may make start and end equal, but the zeroing
         * code will skip any work in that case so there's no
         * need to catch it up here.
         */
        start = end & ~(osb->s_clustersize - 1);

        mlog(0, "2nd range: start: %llu, end: %llu\n",
             (unsigned long long)start, (unsigned long long)end);

        ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
        if (ret)
            mlog_errno(ret);
    }

    ocfs2_commit_trans(osb, handle);
out:
    return ret;
}

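/*
 * Punch a hole: zero the partial clusters at either end of the byte
 * range, remove every whole cluster in between from the extent tree,
 * and drop the page cache over the removed region. Inline-data
 * inodes are simply truncated in place.
 */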
static int ocfs2_remove_inode_range(struct inode *inode,
                    struct buffer_head *di_bh, u64 byte_start,
                    u64 byte_len)
{
    int ret = 0;
    u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct ocfs2_cached_dealloc_ctxt dealloc;
    struct address_space *mapping = inode->i_mapping;
    struct ocfs2_extent_tree et;

    ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
    ocfs2_init_dealloc_ctxt(&dealloc);

    if (byte_len == 0)
        return 0;

    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
        ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
                        byte_start + byte_len, 0);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }
        /*
         * There's no need to get fancy with the page cache
         * truncate of an inline-data inode. We're talking
         * about less than a page here, which will be cached
         * in the dinode buffer anyway.
         */
        unmap_mapping_range(mapping, 0, 0, 0);
        truncate_inode_pages(mapping, 0);
        goto out;
    }

    trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
    trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
    if (trunc_len >= trunc_start)
        trunc_len -= trunc_start;
    else
        trunc_len = 0;

    mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
         (unsigned long long)OCFS2_I(inode)->ip_blkno,
         (unsigned long long)byte_start,
         (unsigned long long)byte_len, trunc_start, trunc_len);

    ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
    if (ret) {
        mlog_errno(ret);
        goto out;
    }

    cpos = trunc_start;
    while (trunc_len) {
        ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
                     &alloc_size, NULL);
        if (ret) {
            mlog_errno(ret);
            goto out;
        }

        if (alloc_size > trunc_len)
            alloc_size = trunc_len;

        /* Only do work for non-holes */
        if (phys_cpos != 0) {
            ret = ocfs2_remove_btree_range(inode, &et, cpos,
                               phys_cpos, alloc_size,
                               &dealloc);
            if (ret) {
                mlog_errno(ret);
                goto out;
            }
        }

        cpos += alloc_size;
        trunc_len -= alloc_size;
    }

    ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);

out:
    ocfs2_schedule_truncate_log_flush(osb, 1);
    ocfs2_run_deallocs(osb, &dealloc);

    return ret;
}

/*
 * Parts of this function taken from xfs_change_file_space()
 */
static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
                     loff_t f_pos, unsigned int cmd,
                     struct ocfs2_space_resv *sr,
                     int change_size)
{
    int ret;
    s64 llen;
    loff_t size;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct buffer_head *di_bh = NULL;
    handle_t *handle;
    unsigned long long max_off = inode->i_sb->s_maxbytes;

    if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
        return -EROFS;

    mutex_lock(&inode->i_mutex);

    /*
     * This prevents concurrent writes on other nodes
     */
    ret = ocfs2_rw_lock(inode, 1);
    if (ret) {
        mlog_errno(ret);
        goto out;
    }

    ret = ocfs2_inode_lock(inode, &di_bh, 1);
    if (ret) {
        mlog_errno(ret);
        goto out_rw_unlock;
    }

    if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
        ret = -EPERM;
        goto out_inode_unlock;
    }

    switch (sr->l_whence) {
    case 0: /*SEEK_SET*/
        break;
    case 1: /*SEEK_CUR*/
        sr->l_start += f_pos;
        break;
    case 2: /*SEEK_END*/
        sr->l_start += i_size_read(inode);
        break;
    default:
        ret = -EINVAL;
        goto out_inode_unlock;
    }
    sr->l_whence = 0;

    llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;

    if (sr->l_start < 0
        || sr->l_start > max_off
        || (sr->l_start + llen) < 0
        || (sr->l_start + llen) > max_off) {
        ret = -EINVAL;
        goto out_inode_unlock;
    }
    size = sr->l_start + sr->l_len;

    if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
        if (sr->l_len <= 0) {
            ret = -EINVAL;
            goto out_inode_unlock;
        }
    }

    if (file && should_remove_suid(file->f_path.dentry)) {
        ret = __ocfs2_write_remove_suid(inode, di_bh);
        if (ret) {
            mlog_errno(ret);
            goto out_inode_unlock;
        }
    }

    down_write(&OCFS2_I(inode)->ip_alloc_sem);
    switch (cmd) {
    case OCFS2_IOC_RESVSP:
    case OCFS2_IOC_RESVSP64:
        /*
         * This takes unsigned offsets, but the signed ones we
         * pass have been checked against overflow above.
         */
        ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
                               sr->l_len);
        break;
    case OCFS2_IOC_UNRESVSP:
    case OCFS2_IOC_UNRESVSP64:
        ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
                           sr->l_len);
        break;
    default:
        ret = -EINVAL;
    }
    up_write(&OCFS2_I(inode)->ip_alloc_sem);
    if (ret) {
        mlog_errno(ret);
        goto out_inode_unlock;
    }

    /*
     * We update c/mtime for these changes
     */
    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
    if (IS_ERR(handle)) {
        ret = PTR_ERR(handle);
        mlog_errno(ret);
        goto out_inode_unlock;
    }

    if (change_size && i_size_read(inode) < size)
        i_size_write(inode, size);

    inode->i_ctime = inode->i_mtime = CURRENT_TIME;
    ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
    if (ret < 0)
        mlog_errno(ret);

    ocfs2_commit_trans(osb, handle);

out_inode_unlock:
    brelse(di_bh);
    ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
    ocfs2_rw_unlock(inode, 1);

out:
    mutex_unlock(&inode->i_mutex);
    return ret;
}

int ocfs2_change_file_space(struct file *file, unsigned int cmd,
                struct ocfs2_space_resv *sr)
{
    struct inode *inode = file->f_path.dentry->d_inode;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

    if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
        !ocfs2_writes_unwritten_extents(osb))
        return -ENOTTY;
    else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
         !ocfs2_sparse_alloc(osb))
        return -ENOTTY;

    if (!S_ISREG(inode->i_mode))
        return -EINVAL;

    if (!(file->f_mode & FMODE_WRITE))
        return -EBADF;

    return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
}

static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
                loff_t len)
{
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
    struct ocfs2_space_resv sr;
    int change_size = 1;

    if (!ocfs2_writes_unwritten_extents(osb))
        return -EOPNOTSUPP;

    if (S_ISDIR(inode->i_mode))
        return -ENODEV;

    if (mode & FALLOC_FL_KEEP_SIZE)
        change_size = 0;

    sr.l_whence = 0;
    sr.l_start = (s64)offset;
    sr.l_len = (s64)len;

    return __ocfs2_change_file_space(NULL, inode, offset,
                     OCFS2_IOC_RESVSP64, &sr, change_size);
}

int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
                   size_t count)
{
    int ret = 0;
    unsigned int extent_flags;
    u32 cpos, clusters, extent_len, phys_cpos;
    struct super_block *sb = inode->i_sb;

    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
        !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
        OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
        return 0;

    cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
    clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;

    while (clusters) {
        ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
                     &extent_flags);
        if (ret < 0) {
            mlog_errno(ret);
            goto out;
        }

        if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
            ret = 1;
            break;
        }

        if (extent_len > clusters)
            extent_len = clusters;

        clusters -= extent_len;
        cpos += extent_len;
    }
out:
    return ret;
}

static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                        loff_t pos, size_t count,
                        int *meta_level)
{
    int ret;
    struct buffer_head *di_bh = NULL;
    u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
    u32 clusters =
        ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;

    ret = ocfs2_inode_lock(inode, &di_bh, 1);
    if (ret) {
        mlog_errno(ret);
        goto out;
    }

    *meta_level = 1;

    ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
    if (ret)
        mlog_errno(ret);
out:
    brelse(di_bh);
    return ret;
}

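/*
 * Take the cluster locks a write needs and decide how it must be
 * performed: strip suid/sgid bits, CoW any refcounted range, resolve
 * the final write position for O_APPEND, and clear *direct_io when
 * holes, inline data, refcounted extents, or an extending write make
 * O_DIRECT unsafe here.
 */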
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                     loff_t *ppos,
                     size_t count,
                     int appending,
                     int *direct_io,
                     int *has_refcount)
{
    int ret = 0, meta_level = 0;
    struct inode *inode = dentry->d_inode;
    loff_t saved_pos, end;

    /*
     * We start with a read level meta lock and only jump to an ex
     * if we need to make modifications here.
     */
    for (;;) {
        ret = ocfs2_inode_lock(inode, NULL, meta_level);
        if (ret < 0) {
            meta_level = -1;
            mlog_errno(ret);
            goto out;
        }

        /* Clear suid / sgid if necessary. We do this here
         * instead of later in the write path because
         * remove_suid() calls ->setattr without any hint that
         * we may have already done our cluster locking. Since
         * ocfs2_setattr() *must* take cluster locks to
         * proceed, this will lead us to recursively lock the
         * inode. There's also the dinode i_size state which
         * can be lost via setattr during extending writes (we
         * set inode->i_size at the end of a write.) */
        if (should_remove_suid(dentry)) {
            if (meta_level == 0) {
                ocfs2_inode_unlock(inode, meta_level);
                meta_level = 1;
                continue;
            }

            ret = ocfs2_write_remove_suid(inode);
            if (ret < 0) {
                mlog_errno(ret);
                goto out_unlock;
            }
        }

        /* work on a copy of ppos until we're sure that we won't have
         * to recalculate it due to relocking. */
        if (appending) {
            saved_pos = i_size_read(inode);
            mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
        } else {
            saved_pos = *ppos;
        }

        end = saved_pos + count;

        ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
        if (ret == 1) {
            ocfs2_inode_unlock(inode, meta_level);
            meta_level = -1;

            ret = ocfs2_prepare_inode_for_refcount(inode,
                                   saved_pos,
                                   count,
                                   &meta_level);
            if (has_refcount)
                *has_refcount = 1;
            if (direct_io)
                *direct_io = 0;
        }

        if (ret < 0) {
            mlog_errno(ret);
            goto out_unlock;
        }

        /*
         * Skip the O_DIRECT checks if we don't need
         * them.
         */
        if (!direct_io || !(*direct_io))
            break;

        /*
         * There's no sane way to do direct writes to an inode
         * with inline data.
         */
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
            *direct_io = 0;
            break;
        }

        /*
         * Allowing concurrent direct writes means
         * i_size changes wouldn't be synchronized, so
         * one node could wind up truncating another
         * node's writes.
         */
        if (end > i_size_read(inode)) {
            *direct_io = 0;
            break;
        }

        /*
         * We don't fill holes during direct io, so
         * check for them here. If any are found, the
         * caller will have to retake some cluster
         * locks and initiate the io as buffered.
         */
        ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
        if (ret == 1) {
            *direct_io = 0;
            ret = 0;
        } else if (ret < 0)
            mlog_errno(ret);
        break;
    }

    if (appending)
        *ppos = saved_pos;

out_unlock:
    if (meta_level >= 0)
        ocfs2_inode_unlock(inode, meta_level);

out:
    return ret;
}

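/*
 * Editor's note: the loop above is an instance of an optimistic
 * lock-upgrade pattern; a simplified, hypothetical sketch of its
 * shape:
 *
 *    meta_level = 0;                    (start at shared/read level)
 *    for (;;) {
 *        ocfs2_inode_lock(inode, NULL, meta_level);
 *        if (need_modification && meta_level == 0) {
 *            ocfs2_inode_unlock(inode, meta_level);
 *            meta_level = 1;            (retry exclusively)
 *            continue;
 *        }
 *        ... do checks, possibly modify ...
 *        break;
 *    }
 *
 * Anything observed under the read-level lock must be re-evaluated
 * after re-acquiring at level 1, which is why the suid check and the
 * saved_pos computation live inside the loop body.
 */
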
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                    const struct iovec *iov,
                    unsigned long nr_segs,
                    loff_t pos)
{
    int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
    int can_do_direct, has_refcount = 0;
    ssize_t written = 0;
    size_t ocount; /* original count */
    size_t count; /* after file limit checks */
    loff_t old_size, *ppos = &iocb->ki_pos;
    u32 old_clusters;
    struct file *file = iocb->ki_filp;
    struct inode *inode = file->f_path.dentry->d_inode;
    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

    mlog_entry("(0x%p, %u, '%.*s')\n", file,
           (unsigned int)nr_segs,
           file->f_path.dentry->d_name.len,
           file->f_path.dentry->d_name.name);

    if (iocb->ki_left == 0)
        return 0;

    vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

    appending = file->f_flags & O_APPEND ? 1 : 0;
    direct_io = file->f_flags & O_DIRECT ? 1 : 0;

    mutex_lock(&inode->i_mutex);

relock:
    /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
    if (direct_io) {
        down_read(&inode->i_alloc_sem);
        have_alloc_sem = 1;
    }

    /* concurrent O_DIRECT writes are allowed */
    rw_level = !direct_io;
    ret = ocfs2_rw_lock(inode, rw_level);
    if (ret < 0) {
        mlog_errno(ret);
        goto out_sems;
    }

    can_do_direct = direct_io;
    ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
                        iocb->ki_left, appending,
                        &can_do_direct, &has_refcount);
    if (ret < 0) {
        mlog_errno(ret);
        goto out;
    }

    /*
     * We can't complete the direct I/O as requested, fall back to
     * buffered I/O.
     */
    if (direct_io && !can_do_direct) {
        ocfs2_rw_unlock(inode, rw_level);
        up_read(&inode->i_alloc_sem);

        have_alloc_sem = 0;
        rw_level = -1;

        direct_io = 0;
        goto relock;
    }

    /*
     * To later detect whether a journal commit for sync writes is
     * necessary, we sample i_size and cluster count here.
     */
    old_size = i_size_read(inode);
    old_clusters = OCFS2_I(inode)->ip_clusters;

    /* communicate with ocfs2_dio_end_io */
    ocfs2_iocb_set_rw_locked(iocb, rw_level);

    if (direct_io) {
        ret = generic_segment_checks(iov, &nr_segs, &ocount,
                         VERIFY_READ);
        if (ret)
            goto out_dio;

        count = ocount;
        ret = generic_write_checks(file, ppos, &count,
                       S_ISBLK(inode->i_mode));
        if (ret)
            goto out_dio;

        written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
                            ppos, count, ocount);
        if (written < 0) {
            /*
             * direct write may have instantiated a few
             * blocks outside i_size. Trim these off again.
             * Don't need i_size_read because we hold i_mutex.
             */
            if (*ppos + count > inode->i_size)
                vmtruncate(inode, inode->i_size);
            ret = written;
            goto out_dio;
        }
    } else {
        written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
    }

out_dio:
    /* buffered aio wouldn't have proper lock coverage today */
    BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));

    if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
        ((file->f_flags & O_DIRECT) && has_refcount)) {
        ret = filemap_fdatawrite_range(file->f_mapping, pos,
                           pos + count - 1);
        if (ret < 0)
            written = ret;

        if (!ret && (old_size != i_size_read(inode) ||
            old_clusters != OCFS2_I(inode)->ip_clusters ||
            has_refcount)) {
            ret = jbd2_journal_force_commit(osb->journal->j_journal);
            if (ret < 0)
                written = ret;
        }

        if (!ret)
            ret = filemap_fdatawait_range(file->f_mapping, pos,
                              pos + count - 1);
    }

    /*
     * Deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an
     * ocfs2_dio_end_io function pointer which is called when O_DIRECT
     * io completes so that it can unlock our rw lock. (It's the
     * clustered equivalent of i_alloc_sem; protects truncate from
     * racing with pending ios.) Unfortunately there are error cases
     * which call end_io and others that don't, so we don't have to
     * unlock the rw_lock if either an async dio is going to do it in
     * the future or an end_io after an error has already done it.
     */
    if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
        rw_level = -1;
        have_alloc_sem = 0;
    }

out:
    if (rw_level != -1)
        ocfs2_rw_unlock(inode, rw_level);

out_sems:
    if (have_alloc_sem)
        up_read(&inode->i_alloc_sem);

    mutex_unlock(&inode->i_mutex);

    if (written)
        ret = written;
    mlog_exit(ret);
    return ret;
}

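/*
 * Editor's note: a condensed view (editor's simplification) of the
 * relock dance above when O_DIRECT cannot be honored:
 *
 *    1. take i_alloc_sem (direct only), then rw_lock at level
 *       !direct_io (0 = shared, so concurrent direct writers are
 *       allowed; 1 = exclusive for buffered);
 *    2. ocfs2_prepare_inode_for_write() clears can_do_direct for
 *       inline data, size-extending writes, holes in the range, or
 *       refcounted extents;
 *    3. if direct was requested but refused, drop both locks, set
 *       direct_io = 0 and goto relock so the buffered path
 *       re-acquires rw_lock at the exclusive level.
 */
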
static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
                struct file *out,
                struct splice_desc *sd)
{
    int ret;

    ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
                        sd->total_len, 0, NULL, NULL);
    if (ret < 0) {
        mlog_errno(ret);
        return ret;
    }

    return splice_from_pipe_feed(pipe, sd, pipe_to_file);
}

static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
                       struct file *out,
                       loff_t *ppos,
                       size_t len,
                       unsigned int flags)
{
    int ret;
    struct address_space *mapping = out->f_mapping;
    struct inode *inode = mapping->host;
    struct splice_desc sd = {
        .total_len = len,
        .flags = flags,
        .pos = *ppos,
        .u.file = out,
    };

    mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
           (unsigned int)len,
           out->f_path.dentry->d_name.len,
           out->f_path.dentry->d_name.name);

    if (pipe->inode)
        mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);

    splice_from_pipe_begin(&sd);
    do {
        ret = splice_from_pipe_next(pipe, &sd);
        if (ret <= 0)
            break;

        mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
        ret = ocfs2_rw_lock(inode, 1);
        if (ret < 0)
            mlog_errno(ret);
        else {
            ret = ocfs2_splice_to_file(pipe, out, &sd);
            ocfs2_rw_unlock(inode, 1);
        }
        mutex_unlock(&inode->i_mutex);
    } while (ret > 0);
    splice_from_pipe_end(pipe, &sd);

    if (pipe->inode)
        mutex_unlock(&pipe->inode->i_mutex);

    if (sd.num_spliced)
        ret = sd.num_spliced;

    if (ret > 0) {
        unsigned long nr_pages;
        int err;

        nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        err = generic_write_sync(out, *ppos, ret);
        if (err)
            ret = err;
        else
            *ppos += ret;

        balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
    }

    mlog_exit(ret);
    return ret;
}

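/*
 * Editor's note: nr_pages above rounds the spliced byte count up to
 * whole pages before charging the dirty-page throttle. With 4KB
 * pages (PAGE_CACHE_SHIFT == 12) and ret = 10000, for example:
 *
 *    nr_pages = (10000 + 4096 - 1) >> 12 = 14095 >> 12 = 3
 *
 * i.e. balance_dirty_pages_ratelimited_nr() is told that up to three
 * pages may have been dirtied by the 10000 spliced bytes.
 */
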
static ssize_t ocfs2_file_splice_read(struct file *in,
                      loff_t *ppos,
                      struct pipe_inode_info *pipe,
                      size_t len,
                      unsigned int flags)
{
    int ret = 0, lock_level = 0;
    struct inode *inode = in->f_path.dentry->d_inode;

    mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
           (unsigned int)len,
           in->f_path.dentry->d_name.len,
           in->f_path.dentry->d_name.name);

    /*
     * See the comment in ocfs2_file_aio_read()
     */
    ret = ocfs2_inode_lock_atime(inode, in->f_vfsmnt, &lock_level);
    if (ret < 0) {
        mlog_errno(ret);
        goto bail;
    }
    ocfs2_inode_unlock(inode, lock_level);

    ret = generic_file_splice_read(in, ppos, pipe, len, flags);

bail:
    mlog_exit(ret);
    return ret;
}

static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
                   const struct iovec *iov,
                   unsigned long nr_segs,
                   loff_t pos)
{
    int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
    struct file *filp = iocb->ki_filp;
    struct inode *inode = filp->f_path.dentry->d_inode;

    mlog_entry("(0x%p, %u, '%.*s')\n", filp,
           (unsigned int)nr_segs,
           filp->f_path.dentry->d_name.len,
           filp->f_path.dentry->d_name.name);

    if (!inode) {
        ret = -EINVAL;
        mlog_errno(ret);
        goto bail;
    }

    /*
     * buffered reads protect themselves in ->readpage(). O_DIRECT reads
     * need locks to protect pending reads from racing with truncate.
     */
    if (filp->f_flags & O_DIRECT) {
        down_read(&inode->i_alloc_sem);
        have_alloc_sem = 1;

        ret = ocfs2_rw_lock(inode, 0);
        if (ret < 0) {
            mlog_errno(ret);
            goto bail;
        }
        rw_level = 0;
        /* communicate with ocfs2_dio_end_io */
        ocfs2_iocb_set_rw_locked(iocb, rw_level);
    }

    /*
     * We're fine letting folks race truncates and extending
     * writes with read across the cluster, just like they can
     * locally. Hence no rw_lock during read.
     *
     * Take and drop the meta data lock to update inode fields
     * like i_size. This gives the checks down in
     * generic_file_aio_read() a chance of actually working.
     */
    ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
    if (ret < 0) {
        mlog_errno(ret);
        goto bail;
    }
    ocfs2_inode_unlock(inode, lock_level);

    ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
    if (ret == -EINVAL)
        mlog(0, "generic_file_aio_read returned -EINVAL\n");

    /* buffered aio wouldn't have proper lock coverage today */
    BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));

    /* see ocfs2_file_aio_write */
    if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
        rw_level = -1;
        have_alloc_sem = 0;
    }

bail:
    if (have_alloc_sem)
        up_read(&inode->i_alloc_sem);
    if (rw_level != -1)
        ocfs2_rw_unlock(inode, rw_level);
    mlog_exit(ret);

    return ret;
}

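/*
 * Editor's note: a summary of the read-side locking above:
 *
 *    buffered read:  no rw_lock here; each ->readpage() takes its
 *                    own cluster locks.
 *    O_DIRECT read:  i_alloc_sem (shared) plus rw_lock at level 0,
 *                    dropped either at "bail" or, for async I/O that
 *                    returned -EIOCBQUEUED, later by
 *                    ocfs2_dio_end_io().
 *
 * The take-and-drop of the meta lock via ocfs2_inode_lock_atime()
 * only refreshes fields such as i_size from the cluster so that
 * generic_file_aio_read()'s bounds checks operate on current values.
 */
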
const struct inode_operations ocfs2_file_iops = {
    .setattr = ocfs2_setattr,
    .getattr = ocfs2_getattr,
    .permission = ocfs2_permission,
    .setxattr = generic_setxattr,
    .getxattr = generic_getxattr,
    .listxattr = ocfs2_listxattr,
    .removexattr = generic_removexattr,
    .fallocate = ocfs2_fallocate,
    .fiemap = ocfs2_fiemap,
};

const struct inode_operations ocfs2_special_file_iops = {
    .setattr = ocfs2_setattr,
    .getattr = ocfs2_getattr,
    .permission = ocfs2_permission,
};

/*
 * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
 */
const struct file_operations ocfs2_fops = {
    .llseek = generic_file_llseek,
    .read = do_sync_read,
    .write = do_sync_write,
    .mmap = ocfs2_mmap,
    .fsync = ocfs2_sync_file,
    .release = ocfs2_file_release,
    .open = ocfs2_file_open,
    .aio_read = ocfs2_file_aio_read,
    .aio_write = ocfs2_file_aio_write,
    .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = ocfs2_compat_ioctl,
#endif
    .lock = ocfs2_lock,
    .flock = ocfs2_flock,
    .splice_read = ocfs2_file_splice_read,
    .splice_write = ocfs2_file_splice_write,
};

const struct file_operations ocfs2_dops = {
    .llseek = generic_file_llseek,
    .read = generic_read_dir,
    .readdir = ocfs2_readdir,
    .fsync = ocfs2_sync_file,
    .release = ocfs2_dir_release,
    .open = ocfs2_dir_open,
    .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = ocfs2_compat_ioctl,
#endif
    .lock = ocfs2_lock,
    .flock = ocfs2_flock,
};

/*
 * POSIX-lockless variants of our file_operations.
 *
 * These will be used if the underlying cluster stack does not support
 * posix file locking, if the user passes the "localflocks" mount
 * option, or if we have a local-only fs.
 *
 * ocfs2_flock is in here because all stacks handle UNIX file locks,
 * so we still want it in the case of no stack support for
 * plocks. Internally, it will do the right thing when asked to ignore
 * the cluster.
 */
const struct file_operations ocfs2_fops_no_plocks = {
    .llseek = generic_file_llseek,
    .read = do_sync_read,
    .write = do_sync_write,
    .mmap = ocfs2_mmap,
    .fsync = ocfs2_sync_file,
    .release = ocfs2_file_release,
    .open = ocfs2_file_open,
    .aio_read = ocfs2_file_aio_read,
    .aio_write = ocfs2_file_aio_write,
    .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = ocfs2_compat_ioctl,
#endif
    .flock = ocfs2_flock,
    .splice_read = ocfs2_file_splice_read,
    .splice_write = ocfs2_file_splice_write,
};

const struct file_operations ocfs2_dops_no_plocks = {
    .llseek = generic_file_llseek,
    .read = generic_read_dir,
    .readdir = ocfs2_readdir,
    .fsync = ocfs2_sync_file,
    .release = ocfs2_dir_release,
    .open = ocfs2_dir_open,
    .unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl = ocfs2_compat_ioctl,
#endif
    .flock = ocfs2_flock,
};
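/*
 * Editor's note: a hedged sketch of how an inode would be wired to
 * one of the four tables above at initialization time; the exact
 * predicate and call site live outside this file, so treat the
 * helper names shown here as assumptions, not the tree's literal
 * code:
 *
 *    int use_plocks = !(localflocks_mount_opt ||
 *                       local_only_mount ||
 *                       !stack_supports_plocks());
 *
 *    if (S_ISREG(inode->i_mode))
 *        inode->i_fop = use_plocks ? &ocfs2_fops
 *                                  : &ocfs2_fops_no_plocks;
 *    else if (S_ISDIR(inode->i_mode))
 *        inode->i_fop = use_plocks ? &ocfs2_dops
 *                                  : &ocfs2_dops_no_plocks;
 */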
