Root/fs/ocfs2/refcounttree.c

1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.c
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#include <linux/sort.h>
19#include <cluster/masklog.h>
20#include "ocfs2.h"
21#include "inode.h"
22#include "alloc.h"
23#include "suballoc.h"
24#include "journal.h"
25#include "uptodate.h"
26#include "super.h"
27#include "buffer_head_io.h"
28#include "blockcheck.h"
29#include "refcounttree.h"
30#include "sysfile.h"
31#include "dlmglue.h"
32#include "extent_map.h"
33#include "aops.h"
34#include "xattr.h"
35#include "namei.h"
36#include "ocfs2_trace.h"
37
38#include <linux/bio.h>
39#include <linux/blkdev.h>
40#include <linux/slab.h>
41#include <linux/writeback.h>
42#include <linux/pagevec.h>
43#include <linux/swap.h>
44#include <linux/security.h>
45#include <linux/fsnotify.h>
46#include <linux/quotaops.h>
47#include <linux/namei.h>
48#include <linux/mount.h>
49
50struct ocfs2_cow_context {
51    struct inode *inode;
52    struct file *file;
53    u32 cow_start;
54    u32 cow_len;
55    struct ocfs2_extent_tree data_et;
56    struct ocfs2_refcount_tree *ref_tree;
57    struct buffer_head *ref_root_bh;
58    struct ocfs2_alloc_context *meta_ac;
59    struct ocfs2_alloc_context *data_ac;
60    struct ocfs2_cached_dealloc_ctxt dealloc;
61    void *cow_object;
62    struct ocfs2_post_refcount *post_refcount;
63    int extra_credits;
64    int (*get_clusters)(struct ocfs2_cow_context *context,
65                u32 v_cluster, u32 *p_cluster,
66                u32 *num_clusters,
67                unsigned int *extent_flags);
68    int (*cow_duplicate_clusters)(handle_t *handle,
69                      struct file *file,
70                      u32 cpos, u32 old_cluster,
71                      u32 new_cluster, u32 new_len);
72};
73
74static inline struct ocfs2_refcount_tree *
75cache_info_to_refcount(struct ocfs2_caching_info *ci)
76{
77    return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78}
79
80static int ocfs2_validate_refcount_block(struct super_block *sb,
81                     struct buffer_head *bh)
82{
83    int rc;
84    struct ocfs2_refcount_block *rb =
85        (struct ocfs2_refcount_block *)bh->b_data;
86
87    trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
88
89    BUG_ON(!buffer_uptodate(bh));
90
91    /*
92     * If the ecc fails, we return the error but otherwise
93     * leave the filesystem running. We know any error is
94     * local to this block.
95     */
96    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
97    if (rc) {
98        mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
99             (unsigned long long)bh->b_blocknr);
100        return rc;
101    }
102
103
104    if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
105        ocfs2_error(sb,
106                "Refcount block #%llu has bad signature %.*s",
107                (unsigned long long)bh->b_blocknr, 7,
108                rb->rf_signature);
109        return -EINVAL;
110    }
111
112    if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
113        ocfs2_error(sb,
114                "Refcount block #%llu has an invalid rf_blkno "
115                "of %llu",
116                (unsigned long long)bh->b_blocknr,
117                (unsigned long long)le64_to_cpu(rb->rf_blkno));
118        return -EINVAL;
119    }
120
121    if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
122        ocfs2_error(sb,
123                "Refcount block #%llu has an invalid "
124                "rf_fs_generation of #%u",
125                (unsigned long long)bh->b_blocknr,
126                le32_to_cpu(rb->rf_fs_generation));
127        return -EINVAL;
128    }
129
130    return 0;
131}
132
133static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
134                     u64 rb_blkno,
135                     struct buffer_head **bh)
136{
137    int rc;
138    struct buffer_head *tmp = *bh;
139
140    rc = ocfs2_read_block(ci, rb_blkno, &tmp,
141                  ocfs2_validate_refcount_block);
142
143    /* If ocfs2_read_block() got us a new bh, pass it up. */
144    if (!rc && !*bh)
145        *bh = tmp;
146
147    return rc;
148}
149
150static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
151{
152    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
153
154    return rf->rf_blkno;
155}
156
157static struct super_block *
158ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
159{
160    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
161
162    return rf->rf_sb;
163}
164
165static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
166{
167    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
168
169    spin_lock(&rf->rf_lock);
170}
171
172static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
173{
174    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
175
176    spin_unlock(&rf->rf_lock);
177}
178
179static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
180{
181    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
182
183    mutex_lock(&rf->rf_io_mutex);
184}
185
186static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
187{
188    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
189
190    mutex_unlock(&rf->rf_io_mutex);
191}
192
193static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
194    .co_owner = ocfs2_refcount_cache_owner,
195    .co_get_super = ocfs2_refcount_cache_get_super,
196    .co_cache_lock = ocfs2_refcount_cache_lock,
197    .co_cache_unlock = ocfs2_refcount_cache_unlock,
198    .co_io_lock = ocfs2_refcount_cache_io_lock,
199    .co_io_unlock = ocfs2_refcount_cache_io_unlock,
200};
201
202static struct ocfs2_refcount_tree *
203ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
204{
205    struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
206    struct ocfs2_refcount_tree *tree = NULL;
207
208    while (n) {
209        tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
210
211        if (blkno < tree->rf_blkno)
212            n = n->rb_left;
213        else if (blkno > tree->rf_blkno)
214            n = n->rb_right;
215        else
216            return tree;
217    }
218
219    return NULL;
220}
221
222/* osb_lock is already locked. */
223static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
224                       struct ocfs2_refcount_tree *new)
225{
226    u64 rf_blkno = new->rf_blkno;
227    struct rb_node *parent = NULL;
228    struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
229    struct ocfs2_refcount_tree *tmp;
230
231    while (*p) {
232        parent = *p;
233
234        tmp = rb_entry(parent, struct ocfs2_refcount_tree,
235                   rf_node);
236
237        if (rf_blkno < tmp->rf_blkno)
238            p = &(*p)->rb_left;
239        else if (rf_blkno > tmp->rf_blkno)
240            p = &(*p)->rb_right;
241        else {
242            /* This should never happen! */
243            mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
244                 (unsigned long long)rf_blkno);
245            BUG();
246        }
247    }
248
249    rb_link_node(&new->rf_node, parent, p);
250    rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
251}
252
253static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
254{
255    ocfs2_metadata_cache_exit(&tree->rf_ci);
256    ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
257    ocfs2_lock_res_free(&tree->rf_lockres);
258    kfree(tree);
259}
260
261static inline void
262ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
263                    struct ocfs2_refcount_tree *tree)
264{
265    rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
266    if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
267        osb->osb_ref_tree_lru = NULL;
268}
269
270static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
271                    struct ocfs2_refcount_tree *tree)
272{
273    spin_lock(&osb->osb_lock);
274    ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
275    spin_unlock(&osb->osb_lock);
276}
277
278static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
279{
280    struct ocfs2_refcount_tree *tree =
281        container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
282
283    ocfs2_free_refcount_tree(tree);
284}
285
286static inline void
287ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
288{
289    kref_get(&tree->rf_getcnt);
290}
291
292static inline void
293ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
294{
295    kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
296}
297
298static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
299                           struct super_block *sb)
300{
301    ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
302    mutex_init(&new->rf_io_mutex);
303    new->rf_sb = sb;
304    spin_lock_init(&new->rf_lock);
305}
306
307static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
308                    struct ocfs2_refcount_tree *new,
309                    u64 rf_blkno, u32 generation)
310{
311    init_rwsem(&new->rf_sem);
312    ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
313                     rf_blkno, generation);
314}
315
316static struct ocfs2_refcount_tree*
317ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
318{
319    struct ocfs2_refcount_tree *new;
320
321    new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
322    if (!new)
323        return NULL;
324
325    new->rf_blkno = rf_blkno;
326    kref_init(&new->rf_getcnt);
327    ocfs2_init_refcount_tree_ci(new, osb->sb);
328
329    return new;
330}
331
332static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
333                   struct ocfs2_refcount_tree **ret_tree)
334{
335    int ret = 0;
336    struct ocfs2_refcount_tree *tree, *new = NULL;
337    struct buffer_head *ref_root_bh = NULL;
338    struct ocfs2_refcount_block *ref_rb;
339
340    spin_lock(&osb->osb_lock);
341    if (osb->osb_ref_tree_lru &&
342        osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
343        tree = osb->osb_ref_tree_lru;
344    else
345        tree = ocfs2_find_refcount_tree(osb, rf_blkno);
346    if (tree)
347        goto out;
348
349    spin_unlock(&osb->osb_lock);
350
351    new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
352    if (!new) {
353        ret = -ENOMEM;
354        mlog_errno(ret);
355        return ret;
356    }
357    /*
358     * We need the generation to create the refcount tree lock and since
359     * it isn't changed during the tree modification, we are safe here to
360     * read without protection.
361     * We also have to purge the cache after we create the lock since the
362     * refcount block may have the stale data. It can only be trusted when
363     * we hold the refcount lock.
364     */
365    ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
366    if (ret) {
367        mlog_errno(ret);
368        ocfs2_metadata_cache_exit(&new->rf_ci);
369        kfree(new);
370        return ret;
371    }
372
373    ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
374    new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
375    ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
376                      new->rf_generation);
377    ocfs2_metadata_cache_purge(&new->rf_ci);
378
379    spin_lock(&osb->osb_lock);
380    tree = ocfs2_find_refcount_tree(osb, rf_blkno);
381    if (tree)
382        goto out;
383
384    ocfs2_insert_refcount_tree(osb, new);
385
386    tree = new;
387    new = NULL;
388
389out:
390    *ret_tree = tree;
391
392    osb->osb_ref_tree_lru = tree;
393
394    spin_unlock(&osb->osb_lock);
395
396    if (new)
397        ocfs2_free_refcount_tree(new);
398
399    brelse(ref_root_bh);
400    return ret;
401}
402
403static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
404{
405    int ret;
406    struct buffer_head *di_bh = NULL;
407    struct ocfs2_dinode *di;
408
409    ret = ocfs2_read_inode_block(inode, &di_bh);
410    if (ret) {
411        mlog_errno(ret);
412        goto out;
413    }
414
415    BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
416
417    di = (struct ocfs2_dinode *)di_bh->b_data;
418    *ref_blkno = le64_to_cpu(di->i_refcount_loc);
419    brelse(di_bh);
420out:
421    return ret;
422}
423
424static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
425                      struct ocfs2_refcount_tree *tree, int rw)
426{
427    int ret;
428
429    ret = ocfs2_refcount_lock(tree, rw);
430    if (ret) {
431        mlog_errno(ret);
432        goto out;
433    }
434
435    if (rw)
436        down_write(&tree->rf_sem);
437    else
438        down_read(&tree->rf_sem);
439
440out:
441    return ret;
442}
443
444/*
445 * Lock the refcount tree pointed by ref_blkno and return the tree.
446 * In most case, we lock the tree and read the refcount block.
447 * So read it here if the caller really needs it.
448 *
449 * If the tree has been re-created by other node, it will free the
450 * old one and re-create it.
451 */
452int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
453                 u64 ref_blkno, int rw,
454                 struct ocfs2_refcount_tree **ret_tree,
455                 struct buffer_head **ref_bh)
456{
457    int ret, delete_tree = 0;
458    struct ocfs2_refcount_tree *tree = NULL;
459    struct buffer_head *ref_root_bh = NULL;
460    struct ocfs2_refcount_block *rb;
461
462again:
463    ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
464    if (ret) {
465        mlog_errno(ret);
466        return ret;
467    }
468
469    ocfs2_refcount_tree_get(tree);
470
471    ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
472    if (ret) {
473        mlog_errno(ret);
474        ocfs2_refcount_tree_put(tree);
475        goto out;
476    }
477
478    ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
479                    &ref_root_bh);
480    if (ret) {
481        mlog_errno(ret);
482        ocfs2_unlock_refcount_tree(osb, tree, rw);
483        ocfs2_refcount_tree_put(tree);
484        goto out;
485    }
486
487    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
488    /*
489     * If the refcount block has been freed and re-created, we may need
490     * to recreate the refcount tree also.
491     *
492     * Here we just remove the tree from the rb-tree, and the last
493     * kref holder will unlock and delete this refcount_tree.
494     * Then we goto "again" and ocfs2_get_refcount_tree will create
495     * the new refcount tree for us.
496     */
497    if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
498        if (!tree->rf_removed) {
499            ocfs2_erase_refcount_tree_from_list(osb, tree);
500            tree->rf_removed = 1;
501            delete_tree = 1;
502        }
503
504        ocfs2_unlock_refcount_tree(osb, tree, rw);
505        /*
506         * We get an extra reference when we create the refcount
507         * tree, so another put will destroy it.
508         */
509        if (delete_tree)
510            ocfs2_refcount_tree_put(tree);
511        brelse(ref_root_bh);
512        ref_root_bh = NULL;
513        goto again;
514    }
515
516    *ret_tree = tree;
517    if (ref_bh) {
518        *ref_bh = ref_root_bh;
519        ref_root_bh = NULL;
520    }
521out:
522    brelse(ref_root_bh);
523    return ret;
524}
525
526void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
527                struct ocfs2_refcount_tree *tree, int rw)
528{
529    if (rw)
530        up_write(&tree->rf_sem);
531    else
532        up_read(&tree->rf_sem);
533
534    ocfs2_refcount_unlock(tree, rw);
535    ocfs2_refcount_tree_put(tree);
536}
537
538void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
539{
540    struct rb_node *node;
541    struct ocfs2_refcount_tree *tree;
542    struct rb_root *root = &osb->osb_rf_lock_tree;
543
544    while ((node = rb_last(root)) != NULL) {
545        tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
546
547        trace_ocfs2_purge_refcount_trees(
548                (unsigned long long) tree->rf_blkno);
549
550        rb_erase(&tree->rf_node, root);
551        ocfs2_free_refcount_tree(tree);
552    }
553}
554
555/*
556 * Create a refcount tree for an inode.
557 * We take for granted that the inode is already locked.
558 */
559static int ocfs2_create_refcount_tree(struct inode *inode,
560                      struct buffer_head *di_bh)
561{
562    int ret;
563    handle_t *handle = NULL;
564    struct ocfs2_alloc_context *meta_ac = NULL;
565    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
566    struct ocfs2_inode_info *oi = OCFS2_I(inode);
567    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
568    struct buffer_head *new_bh = NULL;
569    struct ocfs2_refcount_block *rb;
570    struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571    u16 suballoc_bit_start;
572    u32 num_got;
573    u64 suballoc_loc, first_blkno;
574
575    BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576
577    trace_ocfs2_create_refcount_tree(
578        (unsigned long long)OCFS2_I(inode)->ip_blkno);
579
580    ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
581    if (ret) {
582        mlog_errno(ret);
583        goto out;
584    }
585
586    handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
587    if (IS_ERR(handle)) {
588        ret = PTR_ERR(handle);
589        mlog_errno(ret);
590        goto out;
591    }
592
593    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
594                      OCFS2_JOURNAL_ACCESS_WRITE);
595    if (ret) {
596        mlog_errno(ret);
597        goto out_commit;
598    }
599
600    ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
601                   &suballoc_bit_start, &num_got,
602                   &first_blkno);
603    if (ret) {
604        mlog_errno(ret);
605        goto out_commit;
606    }
607
608    new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
609    if (!new_tree) {
610        ret = -ENOMEM;
611        mlog_errno(ret);
612        goto out_commit;
613    }
614
615    new_bh = sb_getblk(inode->i_sb, first_blkno);
616    ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
617
618    ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
619                      OCFS2_JOURNAL_ACCESS_CREATE);
620    if (ret) {
621        mlog_errno(ret);
622        goto out_commit;
623    }
624
625    /* Initialize ocfs2_refcount_block. */
626    rb = (struct ocfs2_refcount_block *)new_bh->b_data;
627    memset(rb, 0, inode->i_sb->s_blocksize);
628    strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
629    rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
630    rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
631    rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
632    rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
633    rb->rf_blkno = cpu_to_le64(first_blkno);
634    rb->rf_count = cpu_to_le32(1);
635    rb->rf_records.rl_count =
636            cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
637    spin_lock(&osb->osb_lock);
638    rb->rf_generation = osb->s_next_generation++;
639    spin_unlock(&osb->osb_lock);
640
641    ocfs2_journal_dirty(handle, new_bh);
642
643    spin_lock(&oi->ip_lock);
644    oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
645    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
646    di->i_refcount_loc = cpu_to_le64(first_blkno);
647    spin_unlock(&oi->ip_lock);
648
649    trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
650
651    ocfs2_journal_dirty(handle, di_bh);
652
653    /*
654     * We have to init the tree lock here since it will use
655     * the generation number to create it.
656     */
657    new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
658    ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
659                      new_tree->rf_generation);
660
661    spin_lock(&osb->osb_lock);
662    tree = ocfs2_find_refcount_tree(osb, first_blkno);
663
664    /*
665     * We've just created a new refcount tree in this block. If
666     * we found a refcount tree on the ocfs2_super, it must be
667     * one we just deleted. We free the old tree before
668     * inserting the new tree.
669     */
670    BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
671    if (tree)
672        ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
673    ocfs2_insert_refcount_tree(osb, new_tree);
674    spin_unlock(&osb->osb_lock);
675    new_tree = NULL;
676    if (tree)
677        ocfs2_refcount_tree_put(tree);
678
679out_commit:
680    ocfs2_commit_trans(osb, handle);
681
682out:
683    if (new_tree) {
684        ocfs2_metadata_cache_exit(&new_tree->rf_ci);
685        kfree(new_tree);
686    }
687
688    brelse(new_bh);
689    if (meta_ac)
690        ocfs2_free_alloc_context(meta_ac);
691
692    return ret;
693}
694
695static int ocfs2_set_refcount_tree(struct inode *inode,
696                   struct buffer_head *di_bh,
697                   u64 refcount_loc)
698{
699    int ret;
700    handle_t *handle = NULL;
701    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
702    struct ocfs2_inode_info *oi = OCFS2_I(inode);
703    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
704    struct buffer_head *ref_root_bh = NULL;
705    struct ocfs2_refcount_block *rb;
706    struct ocfs2_refcount_tree *ref_tree;
707
708    BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
709
710    ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
711                       &ref_tree, &ref_root_bh);
712    if (ret) {
713        mlog_errno(ret);
714        return ret;
715    }
716
717    handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
718    if (IS_ERR(handle)) {
719        ret = PTR_ERR(handle);
720        mlog_errno(ret);
721        goto out;
722    }
723
724    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
725                      OCFS2_JOURNAL_ACCESS_WRITE);
726    if (ret) {
727        mlog_errno(ret);
728        goto out_commit;
729    }
730
731    ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
732                      OCFS2_JOURNAL_ACCESS_WRITE);
733    if (ret) {
734        mlog_errno(ret);
735        goto out_commit;
736    }
737
738    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
739    le32_add_cpu(&rb->rf_count, 1);
740
741    ocfs2_journal_dirty(handle, ref_root_bh);
742
743    spin_lock(&oi->ip_lock);
744    oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
745    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
746    di->i_refcount_loc = cpu_to_le64(refcount_loc);
747    spin_unlock(&oi->ip_lock);
748    ocfs2_journal_dirty(handle, di_bh);
749
750out_commit:
751    ocfs2_commit_trans(osb, handle);
752out:
753    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
754    brelse(ref_root_bh);
755
756    return ret;
757}
758
759int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
760{
761    int ret, delete_tree = 0;
762    handle_t *handle = NULL;
763    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
764    struct ocfs2_inode_info *oi = OCFS2_I(inode);
765    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
766    struct ocfs2_refcount_block *rb;
767    struct inode *alloc_inode = NULL;
768    struct buffer_head *alloc_bh = NULL;
769    struct buffer_head *blk_bh = NULL;
770    struct ocfs2_refcount_tree *ref_tree;
771    int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
772    u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
773    u16 bit = 0;
774
775    if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
776        return 0;
777
778    BUG_ON(!ref_blkno);
779    ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
780    if (ret) {
781        mlog_errno(ret);
782        return ret;
783    }
784
785    rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
786
787    /*
788     * If we are the last user, we need to free the block.
789     * So lock the allocator ahead.
790     */
791    if (le32_to_cpu(rb->rf_count) == 1) {
792        blk = le64_to_cpu(rb->rf_blkno);
793        bit = le16_to_cpu(rb->rf_suballoc_bit);
794        if (rb->rf_suballoc_loc)
795            bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
796        else
797            bg_blkno = ocfs2_which_suballoc_group(blk, bit);
798
799        alloc_inode = ocfs2_get_system_file_inode(osb,
800                    EXTENT_ALLOC_SYSTEM_INODE,
801                    le16_to_cpu(rb->rf_suballoc_slot));
802        if (!alloc_inode) {
803            ret = -ENOMEM;
804            mlog_errno(ret);
805            goto out;
806        }
807        mutex_lock(&alloc_inode->i_mutex);
808
809        ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
810        if (ret) {
811            mlog_errno(ret);
812            goto out_mutex;
813        }
814
815        credits += OCFS2_SUBALLOC_FREE;
816    }
817
818    handle = ocfs2_start_trans(osb, credits);
819    if (IS_ERR(handle)) {
820        ret = PTR_ERR(handle);
821        mlog_errno(ret);
822        goto out_unlock;
823    }
824
825    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
826                      OCFS2_JOURNAL_ACCESS_WRITE);
827    if (ret) {
828        mlog_errno(ret);
829        goto out_commit;
830    }
831
832    ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
833                      OCFS2_JOURNAL_ACCESS_WRITE);
834    if (ret) {
835        mlog_errno(ret);
836        goto out_commit;
837    }
838
839    spin_lock(&oi->ip_lock);
840    oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
841    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
842    di->i_refcount_loc = 0;
843    spin_unlock(&oi->ip_lock);
844    ocfs2_journal_dirty(handle, di_bh);
845
846    le32_add_cpu(&rb->rf_count , -1);
847    ocfs2_journal_dirty(handle, blk_bh);
848
849    if (!rb->rf_count) {
850        delete_tree = 1;
851        ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
852        ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
853                           alloc_bh, bit, bg_blkno, 1);
854        if (ret)
855            mlog_errno(ret);
856    }
857
858out_commit:
859    ocfs2_commit_trans(osb, handle);
860out_unlock:
861    if (alloc_inode) {
862        ocfs2_inode_unlock(alloc_inode, 1);
863        brelse(alloc_bh);
864    }
865out_mutex:
866    if (alloc_inode) {
867        mutex_unlock(&alloc_inode->i_mutex);
868        iput(alloc_inode);
869    }
870out:
871    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
872    if (delete_tree)
873        ocfs2_refcount_tree_put(ref_tree);
874    brelse(blk_bh);
875
876    return ret;
877}
878
879static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
880                      struct buffer_head *ref_leaf_bh,
881                      u64 cpos, unsigned int len,
882                      struct ocfs2_refcount_rec *ret_rec,
883                      int *index)
884{
885    int i = 0;
886    struct ocfs2_refcount_block *rb =
887        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
888    struct ocfs2_refcount_rec *rec = NULL;
889
890    for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
891        rec = &rb->rf_records.rl_recs[i];
892
893        if (le64_to_cpu(rec->r_cpos) +
894            le32_to_cpu(rec->r_clusters) <= cpos)
895            continue;
896        else if (le64_to_cpu(rec->r_cpos) > cpos)
897            break;
898
899        /* ok, cpos fail in this rec. Just return. */
900        if (ret_rec)
901            *ret_rec = *rec;
902        goto out;
903    }
904
905    if (ret_rec) {
906        /* We meet with a hole here, so fake the rec. */
907        ret_rec->r_cpos = cpu_to_le64(cpos);
908        ret_rec->r_refcount = 0;
909        if (i < le16_to_cpu(rb->rf_records.rl_used) &&
910            le64_to_cpu(rec->r_cpos) < cpos + len)
911            ret_rec->r_clusters =
912                cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
913        else
914            ret_rec->r_clusters = cpu_to_le32(len);
915    }
916
917out:
918    *index = i;
919}
920
921/*
922 * Try to remove refcount tree. The mechanism is:
923 * 1) Check whether i_clusters == 0, if no, exit.
924 * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
925 * 3) Check whether we have inline xattr stored outside, if yes, exit.
926 * 4) Remove the tree.
927 */
928int ocfs2_try_remove_refcount_tree(struct inode *inode,
929                   struct buffer_head *di_bh)
930{
931    int ret;
932    struct ocfs2_inode_info *oi = OCFS2_I(inode);
933    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
934
935    down_write(&oi->ip_xattr_sem);
936    down_write(&oi->ip_alloc_sem);
937
938    if (oi->ip_clusters)
939        goto out;
940
941    if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
942        goto out;
943
944    if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
945        ocfs2_has_inline_xattr_value_outside(inode, di))
946        goto out;
947
948    ret = ocfs2_remove_refcount_tree(inode, di_bh);
949    if (ret)
950        mlog_errno(ret);
951out:
952    up_write(&oi->ip_alloc_sem);
953    up_write(&oi->ip_xattr_sem);
954    return 0;
955}
956
957/*
958 * Find the end range for a leaf refcount block indicated by
959 * el->l_recs[index].e_blkno.
960 */
961static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
962                       struct buffer_head *ref_root_bh,
963                       struct ocfs2_extent_block *eb,
964                       struct ocfs2_extent_list *el,
965                       int index, u32 *cpos_end)
966{
967    int ret, i, subtree_root;
968    u32 cpos;
969    u64 blkno;
970    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
971    struct ocfs2_path *left_path = NULL, *right_path = NULL;
972    struct ocfs2_extent_tree et;
973    struct ocfs2_extent_list *tmp_el;
974
975    if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
976        /*
977         * We have a extent rec after index, so just use the e_cpos
978         * of the next extent rec.
979         */
980        *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
981        return 0;
982    }
983
984    if (!eb || (eb && !eb->h_next_leaf_blk)) {
985        /*
986         * We are the last extent rec, so any high cpos should
987         * be stored in this leaf refcount block.
988         */
989        *cpos_end = UINT_MAX;
990        return 0;
991    }
992
993    /*
994     * If the extent block isn't the last one, we have to find
995     * the subtree root between this extent block and the next
996     * leaf extent block and get the corresponding e_cpos from
997     * the subroot. Otherwise we may corrupt the b-tree.
998     */
999    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1000
1001    left_path = ocfs2_new_path_from_et(&et);
1002    if (!left_path) {
1003        ret = -ENOMEM;
1004        mlog_errno(ret);
1005        goto out;
1006    }
1007
1008    cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1009    ret = ocfs2_find_path(ci, left_path, cpos);
1010    if (ret) {
1011        mlog_errno(ret);
1012        goto out;
1013    }
1014
1015    right_path = ocfs2_new_path_from_path(left_path);
1016    if (!right_path) {
1017        ret = -ENOMEM;
1018        mlog_errno(ret);
1019        goto out;
1020    }
1021
1022    ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1023    if (ret) {
1024        mlog_errno(ret);
1025        goto out;
1026    }
1027
1028    ret = ocfs2_find_path(ci, right_path, cpos);
1029    if (ret) {
1030        mlog_errno(ret);
1031        goto out;
1032    }
1033
1034    subtree_root = ocfs2_find_subtree_root(&et, left_path,
1035                           right_path);
1036
1037    tmp_el = left_path->p_node[subtree_root].el;
1038    blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1039    for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1040        if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1041            *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1042            break;
1043        }
1044    }
1045
1046    BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1047
1048out:
1049    ocfs2_free_path(left_path);
1050    ocfs2_free_path(right_path);
1051    return ret;
1052}
1053
1054/*
1055 * Given a cpos and len, try to find the refcount record which contains cpos.
1056 * 1. If cpos can be found in one refcount record, return the record.
1057 * 2. If cpos can't be found, return a fake record which start from cpos
1058 * and end at a small value between cpos+len and start of the next record.
1059 * This fake record has r_refcount = 0.
1060 */
1061static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1062                  struct buffer_head *ref_root_bh,
1063                  u64 cpos, unsigned int len,
1064                  struct ocfs2_refcount_rec *ret_rec,
1065                  int *index,
1066                  struct buffer_head **ret_bh)
1067{
1068    int ret = 0, i, found;
1069    u32 low_cpos, uninitialized_var(cpos_end);
1070    struct ocfs2_extent_list *el;
1071    struct ocfs2_extent_rec *rec = NULL;
1072    struct ocfs2_extent_block *eb = NULL;
1073    struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1074    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1075    struct ocfs2_refcount_block *rb =
1076            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1077
1078    if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1079        ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1080                          ret_rec, index);
1081        *ret_bh = ref_root_bh;
1082        get_bh(ref_root_bh);
1083        return 0;
1084    }
1085
1086    el = &rb->rf_list;
1087    low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1088
1089    if (el->l_tree_depth) {
1090        ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1091        if (ret) {
1092            mlog_errno(ret);
1093            goto out;
1094        }
1095
1096        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1097        el = &eb->h_list;
1098
1099        if (el->l_tree_depth) {
1100            ocfs2_error(sb,
1101            "refcount tree %llu has non zero tree "
1102            "depth in leaf btree tree block %llu\n",
1103            (unsigned long long)ocfs2_metadata_cache_owner(ci),
1104            (unsigned long long)eb_bh->b_blocknr);
1105            ret = -EROFS;
1106            goto out;
1107        }
1108    }
1109
1110    found = 0;
1111    for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1112        rec = &el->l_recs[i];
1113
1114        if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1115            found = 1;
1116            break;
1117        }
1118    }
1119
1120    if (found) {
1121        ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1122                          eb, el, i, &cpos_end);
1123        if (ret) {
1124            mlog_errno(ret);
1125            goto out;
1126        }
1127
1128        if (cpos_end < low_cpos + len)
1129            len = cpos_end - low_cpos;
1130    }
1131
1132    ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1133                    &ref_leaf_bh);
1134    if (ret) {
1135        mlog_errno(ret);
1136        goto out;
1137    }
1138
1139    ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1140                      ret_rec, index);
1141    *ret_bh = ref_leaf_bh;
1142out:
1143    brelse(eb_bh);
1144    return ret;
1145}
1146
/* How a refcount record relates to its neighbours in the same block. */
enum ocfs2_ref_rec_contig {
	/* Not mergeable with either neighbour. */
	REF_CONTIG_NONE		= 0,
	/* Mergeable with the record before it. */
	REF_CONTIG_LEFT		= 1,
	/* Mergeable with the record after it. */
	REF_CONTIG_RIGHT	= 2,
	/* Mergeable with both neighbours. */
	REF_CONTIG_LEFTRIGHT	= 3,
};
1153
1154static enum ocfs2_ref_rec_contig
1155    ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1156                    int index)
1157{
1158    if ((rb->rf_records.rl_recs[index].r_refcount ==
1159        rb->rf_records.rl_recs[index + 1].r_refcount) &&
1160        (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1161        le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1162        le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1163        return REF_CONTIG_RIGHT;
1164
1165    return REF_CONTIG_NONE;
1166}
1167
1168static enum ocfs2_ref_rec_contig
1169    ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1170                  int index)
1171{
1172    enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1173
1174    if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1175        ret = ocfs2_refcount_rec_adjacent(rb, index);
1176
1177    if (index > 0) {
1178        enum ocfs2_ref_rec_contig tmp;
1179
1180        tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1181
1182        if (tmp == REF_CONTIG_RIGHT) {
1183            if (ret == REF_CONTIG_RIGHT)
1184                ret = REF_CONTIG_LEFTRIGHT;
1185            else
1186                ret = REF_CONTIG_LEFT;
1187        }
1188    }
1189
1190    return ret;
1191}
1192
1193static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1194                       int index)
1195{
1196    BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1197           rb->rf_records.rl_recs[index+1].r_refcount);
1198
1199    le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1200             le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1201
1202    if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1203        memmove(&rb->rf_records.rl_recs[index + 1],
1204            &rb->rf_records.rl_recs[index + 2],
1205            sizeof(struct ocfs2_refcount_rec) *
1206            (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1207
1208    memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1209           0, sizeof(struct ocfs2_refcount_rec));
1210    le16_add_cpu(&rb->rf_records.rl_used, -1);
1211}
1212
1213/*
1214 * Merge the refcount rec if we are contiguous with the adjacent recs.
1215 */
1216static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1217                     int index)
1218{
1219    enum ocfs2_ref_rec_contig contig =
1220                ocfs2_refcount_rec_contig(rb, index);
1221
1222    if (contig == REF_CONTIG_NONE)
1223        return;
1224
1225    if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1226        BUG_ON(index == 0);
1227        index--;
1228    }
1229
1230    ocfs2_rotate_refcount_rec_left(rb, index);
1231
1232    if (contig == REF_CONTIG_LEFTRIGHT)
1233        ocfs2_rotate_refcount_rec_left(rb, index);
1234}
1235
1236/*
1237 * Change the refcount indexed by "index" in ref_bh.
1238 * If refcount reaches 0, remove it.
1239 */
1240static int ocfs2_change_refcount_rec(handle_t *handle,
1241                     struct ocfs2_caching_info *ci,
1242                     struct buffer_head *ref_leaf_bh,
1243                     int index, int merge, int change)
1244{
1245    int ret;
1246    struct ocfs2_refcount_block *rb =
1247            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1248    struct ocfs2_refcount_list *rl = &rb->rf_records;
1249    struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1250
1251    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1252                      OCFS2_JOURNAL_ACCESS_WRITE);
1253    if (ret) {
1254        mlog_errno(ret);
1255        goto out;
1256    }
1257
1258    trace_ocfs2_change_refcount_rec(
1259        (unsigned long long)ocfs2_metadata_cache_owner(ci),
1260        index, le32_to_cpu(rec->r_refcount), change);
1261    le32_add_cpu(&rec->r_refcount, change);
1262
1263    if (!rec->r_refcount) {
1264        if (index != le16_to_cpu(rl->rl_used) - 1) {
1265            memmove(rec, rec + 1,
1266                (le16_to_cpu(rl->rl_used) - index - 1) *
1267                sizeof(struct ocfs2_refcount_rec));
1268            memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1269                   0, sizeof(struct ocfs2_refcount_rec));
1270        }
1271
1272        le16_add_cpu(&rl->rl_used, -1);
1273    } else if (merge)
1274        ocfs2_refcount_rec_merge(rb, index);
1275
1276    ocfs2_journal_dirty(handle, ref_leaf_bh);
1277out:
1278    return ret;
1279}
1280
1281static int ocfs2_expand_inline_ref_root(handle_t *handle,
1282                    struct ocfs2_caching_info *ci,
1283                    struct buffer_head *ref_root_bh,
1284                    struct buffer_head **ref_leaf_bh,
1285                    struct ocfs2_alloc_context *meta_ac)
1286{
1287    int ret;
1288    u16 suballoc_bit_start;
1289    u32 num_got;
1290    u64 suballoc_loc, blkno;
1291    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1292    struct buffer_head *new_bh = NULL;
1293    struct ocfs2_refcount_block *new_rb;
1294    struct ocfs2_refcount_block *root_rb =
1295            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1296
1297    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1298                      OCFS2_JOURNAL_ACCESS_WRITE);
1299    if (ret) {
1300        mlog_errno(ret);
1301        goto out;
1302    }
1303
1304    ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1305                   &suballoc_bit_start, &num_got,
1306                   &blkno);
1307    if (ret) {
1308        mlog_errno(ret);
1309        goto out;
1310    }
1311
1312    new_bh = sb_getblk(sb, blkno);
1313    if (new_bh == NULL) {
1314        ret = -EIO;
1315        mlog_errno(ret);
1316        goto out;
1317    }
1318    ocfs2_set_new_buffer_uptodate(ci, new_bh);
1319
1320    ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1321                      OCFS2_JOURNAL_ACCESS_CREATE);
1322    if (ret) {
1323        mlog_errno(ret);
1324        goto out;
1325    }
1326
1327    /*
1328     * Initialize ocfs2_refcount_block.
1329     * It should contain the same information as the old root.
1330     * so just memcpy it and change the corresponding field.
1331     */
1332    memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1333
1334    new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1335    new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1336    new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1337    new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1338    new_rb->rf_blkno = cpu_to_le64(blkno);
1339    new_rb->rf_cpos = cpu_to_le32(0);
1340    new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1341    new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1342    ocfs2_journal_dirty(handle, new_bh);
1343
1344    /* Now change the root. */
1345    memset(&root_rb->rf_list, 0, sb->s_blocksize -
1346           offsetof(struct ocfs2_refcount_block, rf_list));
1347    root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1348    root_rb->rf_clusters = cpu_to_le32(1);
1349    root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1350    root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1351    root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1352    root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1353
1354    ocfs2_journal_dirty(handle, ref_root_bh);
1355
1356    trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1357        le16_to_cpu(new_rb->rf_records.rl_used));
1358
1359    *ref_leaf_bh = new_bh;
1360    new_bh = NULL;
1361out:
1362    brelse(new_bh);
1363    return ret;
1364}
1365
1366static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1367                       struct ocfs2_refcount_rec *next)
1368{
1369    if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1370        ocfs2_get_ref_rec_low_cpos(next))
1371        return 1;
1372
1373    return 0;
1374}
1375
1376static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1377{
1378    const struct ocfs2_refcount_rec *l = a, *r = b;
1379    u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1380    u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1381
1382    if (l_cpos > r_cpos)
1383        return 1;
1384    if (l_cpos < r_cpos)
1385        return -1;
1386    return 0;
1387}
1388
1389static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1390{
1391    const struct ocfs2_refcount_rec *l = a, *r = b;
1392    u64 l_cpos = le64_to_cpu(l->r_cpos);
1393    u64 r_cpos = le64_to_cpu(r->r_cpos);
1394
1395    if (l_cpos > r_cpos)
1396        return 1;
1397    if (l_cpos < r_cpos)
1398        return -1;
1399    return 0;
1400}
1401
1402static void swap_refcount_rec(void *a, void *b, int size)
1403{
1404    struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1405
1406    tmp = *(struct ocfs2_refcount_rec *)l;
1407    *(struct ocfs2_refcount_rec *)l =
1408            *(struct ocfs2_refcount_rec *)r;
1409    *(struct ocfs2_refcount_rec *)r = tmp;
1410}
1411
1412/*
1413 * The refcount cpos are ordered by their 64bit cpos,
1414 * But we will use the low 32 bit to be the e_cpos in the b-tree.
1415 * So we need to make sure that this pos isn't intersected with others.
1416 *
1417 * Note: The refcount block is already sorted by their low 32 bit cpos,
1418 * So just try the middle pos first, and we will exit when we find
1419 * the good position.
1420 */
1421static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1422                     u32 *split_pos, int *split_index)
1423{
1424    int num_used = le16_to_cpu(rl->rl_used);
1425    int delta, middle = num_used / 2;
1426
1427    for (delta = 0; delta < middle; delta++) {
1428        /* Let's check delta earlier than middle */
1429        if (ocfs2_refcount_rec_no_intersect(
1430                    &rl->rl_recs[middle - delta - 1],
1431                    &rl->rl_recs[middle - delta])) {
1432            *split_index = middle - delta;
1433            break;
1434        }
1435
1436        /* For even counts, don't walk off the end */
1437        if ((middle + delta + 1) == num_used)
1438            continue;
1439
1440        /* Now try delta past middle */
1441        if (ocfs2_refcount_rec_no_intersect(
1442                    &rl->rl_recs[middle + delta],
1443                    &rl->rl_recs[middle + delta + 1])) {
1444            *split_index = middle + delta + 1;
1445            break;
1446        }
1447    }
1448
1449    if (delta >= middle)
1450        return -ENOSPC;
1451
1452    *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1453    return 0;
1454}
1455
1456static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1457                        struct buffer_head *new_bh,
1458                        u32 *split_cpos)
1459{
1460    int split_index = 0, num_moved, ret;
1461    u32 cpos = 0;
1462    struct ocfs2_refcount_block *rb =
1463            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1464    struct ocfs2_refcount_list *rl = &rb->rf_records;
1465    struct ocfs2_refcount_block *new_rb =
1466            (struct ocfs2_refcount_block *)new_bh->b_data;
1467    struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1468
1469    trace_ocfs2_divide_leaf_refcount_block(
1470        (unsigned long long)ref_leaf_bh->b_blocknr,
1471        le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1472
1473    /*
1474     * XXX: Improvement later.
1475     * If we know all the high 32 bit cpos is the same, no need to sort.
1476     *
1477     * In order to make the whole process safe, we do:
1478     * 1. sort the entries by their low 32 bit cpos first so that we can
1479     * find the split cpos easily.
1480     * 2. call ocfs2_insert_extent to insert the new refcount block.
1481     * 3. move the refcount rec to the new block.
1482     * 4. sort the entries by their 64 bit cpos.
1483     * 5. dirty the new_rb and rb.
1484     */
1485    sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1486         sizeof(struct ocfs2_refcount_rec),
1487         cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1488
1489    ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1490    if (ret) {
1491        mlog_errno(ret);
1492        return ret;
1493    }
1494
1495    new_rb->rf_cpos = cpu_to_le32(cpos);
1496
1497    /* move refcount records starting from split_index to the new block. */
1498    num_moved = le16_to_cpu(rl->rl_used) - split_index;
1499    memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1500           num_moved * sizeof(struct ocfs2_refcount_rec));
1501
1502    /*ok, remove the entries we just moved over to the other block. */
1503    memset(&rl->rl_recs[split_index], 0,
1504           num_moved * sizeof(struct ocfs2_refcount_rec));
1505
1506    /* change old and new rl_used accordingly. */
1507    le16_add_cpu(&rl->rl_used, -num_moved);
1508    new_rl->rl_used = cpu_to_le16(num_moved);
1509
1510    sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1511         sizeof(struct ocfs2_refcount_rec),
1512         cmp_refcount_rec_by_cpos, swap_refcount_rec);
1513
1514    sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1515         sizeof(struct ocfs2_refcount_rec),
1516         cmp_refcount_rec_by_cpos, swap_refcount_rec);
1517
1518    *split_cpos = cpos;
1519    return 0;
1520}
1521
1522static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1523                     struct ocfs2_caching_info *ci,
1524                     struct buffer_head *ref_root_bh,
1525                     struct buffer_head *ref_leaf_bh,
1526                     struct ocfs2_alloc_context *meta_ac)
1527{
1528    int ret;
1529    u16 suballoc_bit_start;
1530    u32 num_got, new_cpos;
1531    u64 suballoc_loc, blkno;
1532    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1533    struct ocfs2_refcount_block *root_rb =
1534            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1535    struct buffer_head *new_bh = NULL;
1536    struct ocfs2_refcount_block *new_rb;
1537    struct ocfs2_extent_tree ref_et;
1538
1539    BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1540
1541    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1542                      OCFS2_JOURNAL_ACCESS_WRITE);
1543    if (ret) {
1544        mlog_errno(ret);
1545        goto out;
1546    }
1547
1548    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1549                      OCFS2_JOURNAL_ACCESS_WRITE);
1550    if (ret) {
1551        mlog_errno(ret);
1552        goto out;
1553    }
1554
1555    ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
1556                   &suballoc_bit_start, &num_got,
1557                   &blkno);
1558    if (ret) {
1559        mlog_errno(ret);
1560        goto out;
1561    }
1562
1563    new_bh = sb_getblk(sb, blkno);
1564    if (new_bh == NULL) {
1565        ret = -EIO;
1566        mlog_errno(ret);
1567        goto out;
1568    }
1569    ocfs2_set_new_buffer_uptodate(ci, new_bh);
1570
1571    ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1572                      OCFS2_JOURNAL_ACCESS_CREATE);
1573    if (ret) {
1574        mlog_errno(ret);
1575        goto out;
1576    }
1577
1578    /* Initialize ocfs2_refcount_block. */
1579    new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1580    memset(new_rb, 0, sb->s_blocksize);
1581    strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1582    new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1583    new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
1584    new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1585    new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1586    new_rb->rf_blkno = cpu_to_le64(blkno);
1587    new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1588    new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1589    new_rb->rf_records.rl_count =
1590                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1591    new_rb->rf_generation = root_rb->rf_generation;
1592
1593    ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1594    if (ret) {
1595        mlog_errno(ret);
1596        goto out;
1597    }
1598
1599    ocfs2_journal_dirty(handle, ref_leaf_bh);
1600    ocfs2_journal_dirty(handle, new_bh);
1601
1602    ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1603
1604    trace_ocfs2_new_leaf_refcount_block(
1605            (unsigned long long)new_bh->b_blocknr, new_cpos);
1606
1607    /* Insert the new leaf block with the specific offset cpos. */
1608    ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1609                  1, 0, meta_ac);
1610    if (ret)
1611        mlog_errno(ret);
1612
1613out:
1614    brelse(new_bh);
1615    return ret;
1616}
1617
1618static int ocfs2_expand_refcount_tree(handle_t *handle,
1619                      struct ocfs2_caching_info *ci,
1620                      struct buffer_head *ref_root_bh,
1621                      struct buffer_head *ref_leaf_bh,
1622                      struct ocfs2_alloc_context *meta_ac)
1623{
1624    int ret;
1625    struct buffer_head *expand_bh = NULL;
1626
1627    if (ref_root_bh == ref_leaf_bh) {
1628        /*
1629         * the old root bh hasn't been expanded to a b-tree,
1630         * so expand it first.
1631         */
1632        ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1633                           &expand_bh, meta_ac);
1634        if (ret) {
1635            mlog_errno(ret);
1636            goto out;
1637        }
1638    } else {
1639        expand_bh = ref_leaf_bh;
1640        get_bh(expand_bh);
1641    }
1642
1643
1644    /* Now add a new refcount block into the tree.*/
1645    ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1646                        expand_bh, meta_ac);
1647    if (ret)
1648        mlog_errno(ret);
1649out:
1650    brelse(expand_bh);
1651    return ret;
1652}
1653
1654/*
1655 * Adjust the extent rec in b-tree representing ref_leaf_bh.
1656 *
1657 * Only called when we have inserted a new refcount rec at index 0
1658 * which means ocfs2_extent_rec.e_cpos may need some change.
1659 */
1660static int ocfs2_adjust_refcount_rec(handle_t *handle,
1661                     struct ocfs2_caching_info *ci,
1662                     struct buffer_head *ref_root_bh,
1663                     struct buffer_head *ref_leaf_bh,
1664                     struct ocfs2_refcount_rec *rec)
1665{
1666    int ret = 0, i;
1667    u32 new_cpos, old_cpos;
1668    struct ocfs2_path *path = NULL;
1669    struct ocfs2_extent_tree et;
1670    struct ocfs2_refcount_block *rb =
1671        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1672    struct ocfs2_extent_list *el;
1673
1674    if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1675        goto out;
1676
1677    rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1678    old_cpos = le32_to_cpu(rb->rf_cpos);
1679    new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1680    if (old_cpos <= new_cpos)
1681        goto out;
1682
1683    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1684
1685    path = ocfs2_new_path_from_et(&et);
1686    if (!path) {
1687        ret = -ENOMEM;
1688        mlog_errno(ret);
1689        goto out;
1690    }
1691
1692    ret = ocfs2_find_path(ci, path, old_cpos);
1693    if (ret) {
1694        mlog_errno(ret);
1695        goto out;
1696    }
1697
1698    /*
1699     * 2 more credits, one for the leaf refcount block, one for
1700     * the extent block contains the extent rec.
1701     */
1702    ret = ocfs2_extend_trans(handle, 2);
1703    if (ret < 0) {
1704        mlog_errno(ret);
1705        goto out;
1706    }
1707
1708    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1709                      OCFS2_JOURNAL_ACCESS_WRITE);
1710    if (ret < 0) {
1711        mlog_errno(ret);
1712        goto out;
1713    }
1714
1715    ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1716                      OCFS2_JOURNAL_ACCESS_WRITE);
1717    if (ret < 0) {
1718        mlog_errno(ret);
1719        goto out;
1720    }
1721
1722    /* change the leaf extent block first. */
1723    el = path_leaf_el(path);
1724
1725    for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1726        if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1727            break;
1728
1729    BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1730
1731    el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1732
1733    /* change the r_cpos in the leaf block. */
1734    rb->rf_cpos = cpu_to_le32(new_cpos);
1735
1736    ocfs2_journal_dirty(handle, path_leaf_bh(path));
1737    ocfs2_journal_dirty(handle, ref_leaf_bh);
1738
1739out:
1740    ocfs2_free_path(path);
1741    return ret;
1742}
1743
1744static int ocfs2_insert_refcount_rec(handle_t *handle,
1745                     struct ocfs2_caching_info *ci,
1746                     struct buffer_head *ref_root_bh,
1747                     struct buffer_head *ref_leaf_bh,
1748                     struct ocfs2_refcount_rec *rec,
1749                     int index, int merge,
1750                     struct ocfs2_alloc_context *meta_ac)
1751{
1752    int ret;
1753    struct ocfs2_refcount_block *rb =
1754            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1755    struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1756    struct buffer_head *new_bh = NULL;
1757
1758    BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1759
1760    if (rf_list->rl_used == rf_list->rl_count) {
1761        u64 cpos = le64_to_cpu(rec->r_cpos);
1762        u32 len = le32_to_cpu(rec->r_clusters);
1763
1764        ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1765                         ref_leaf_bh, meta_ac);
1766        if (ret) {
1767            mlog_errno(ret);
1768            goto out;
1769        }
1770
1771        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1772                         cpos, len, NULL, &index,
1773                         &new_bh);
1774        if (ret) {
1775            mlog_errno(ret);
1776            goto out;
1777        }
1778
1779        ref_leaf_bh = new_bh;
1780        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1781        rf_list = &rb->rf_records;
1782    }
1783
1784    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1785                      OCFS2_JOURNAL_ACCESS_WRITE);
1786    if (ret) {
1787        mlog_errno(ret);
1788        goto out;
1789    }
1790
1791    if (index < le16_to_cpu(rf_list->rl_used))
1792        memmove(&rf_list->rl_recs[index + 1],
1793            &rf_list->rl_recs[index],
1794            (le16_to_cpu(rf_list->rl_used) - index) *
1795             sizeof(struct ocfs2_refcount_rec));
1796
1797    trace_ocfs2_insert_refcount_rec(
1798        (unsigned long long)ref_leaf_bh->b_blocknr, index,
1799        (unsigned long long)le64_to_cpu(rec->r_cpos),
1800        le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1801
1802    rf_list->rl_recs[index] = *rec;
1803
1804    le16_add_cpu(&rf_list->rl_used, 1);
1805
1806    if (merge)
1807        ocfs2_refcount_rec_merge(rb, index);
1808
1809    ocfs2_journal_dirty(handle, ref_leaf_bh);
1810
1811    if (index == 0) {
1812        ret = ocfs2_adjust_refcount_rec(handle, ci,
1813                        ref_root_bh,
1814                        ref_leaf_bh, rec);
1815        if (ret)
1816            mlog_errno(ret);
1817    }
1818out:
1819    brelse(new_bh);
1820    return ret;
1821}
1822
1823/*
1824 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1825 * This is much simple than our b-tree code.
1826 * split_rec is the new refcount rec we want to insert.
1827 * If split_rec->r_refcount > 0, we are changing the refcount(in case we
1828 * increase refcount or decrease a refcount to non-zero).
1829 * If split_rec->r_refcount == 0, we are punching a hole in current refcount
1830 * rec( in case we decrease a refcount to zero).
1831 */
1832static int ocfs2_split_refcount_rec(handle_t *handle,
1833                    struct ocfs2_caching_info *ci,
1834                    struct buffer_head *ref_root_bh,
1835                    struct buffer_head *ref_leaf_bh,
1836                    struct ocfs2_refcount_rec *split_rec,
1837                    int index, int merge,
1838                    struct ocfs2_alloc_context *meta_ac,
1839                    struct ocfs2_cached_dealloc_ctxt *dealloc)
1840{
1841    int ret, recs_need;
1842    u32 len;
1843    struct ocfs2_refcount_block *rb =
1844            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1845    struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1846    struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1847    struct ocfs2_refcount_rec *tail_rec = NULL;
1848    struct buffer_head *new_bh = NULL;
1849
1850    BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1851
1852    trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
1853        le32_to_cpu(orig_rec->r_clusters),
1854        le32_to_cpu(orig_rec->r_refcount),
1855        le64_to_cpu(split_rec->r_cpos),
1856        le32_to_cpu(split_rec->r_clusters),
1857        le32_to_cpu(split_rec->r_refcount));
1858
1859    /*
1860     * If we just need to split the header or tail clusters,
1861     * no more recs are needed, just split is OK.
1862     * Otherwise we at least need one new recs.
1863     */
1864    if (!split_rec->r_refcount &&
1865        (split_rec->r_cpos == orig_rec->r_cpos ||
1866         le64_to_cpu(split_rec->r_cpos) +
1867         le32_to_cpu(split_rec->r_clusters) ==
1868         le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1869        recs_need = 0;
1870    else
1871        recs_need = 1;
1872
1873    /*
1874     * We need one more rec if we split in the middle and the new rec have
1875     * some refcount in it.
1876     */
1877    if (split_rec->r_refcount &&
1878        (split_rec->r_cpos != orig_rec->r_cpos &&
1879         le64_to_cpu(split_rec->r_cpos) +
1880         le32_to_cpu(split_rec->r_clusters) !=
1881         le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1882        recs_need++;
1883
1884    /* If the leaf block don't have enough record, expand it. */
1885    if (le16_to_cpu(rf_list->rl_used) + recs_need >
1886                     le16_to_cpu(rf_list->rl_count)) {
1887        struct ocfs2_refcount_rec tmp_rec;
1888        u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1889        len = le32_to_cpu(orig_rec->r_clusters);
1890        ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1891                         ref_leaf_bh, meta_ac);
1892        if (ret) {
1893            mlog_errno(ret);
1894            goto out;
1895        }
1896
1897        /*
1898         * We have to re-get it since now cpos may be moved to
1899         * another leaf block.
1900         */
1901        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1902                         cpos, len, &tmp_rec, &index,
1903                         &new_bh);
1904        if (ret) {
1905            mlog_errno(ret);
1906            goto out;
1907        }
1908
1909        ref_leaf_bh = new_bh;
1910        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1911        rf_list = &rb->rf_records;
1912        orig_rec = &rf_list->rl_recs[index];
1913    }
1914
1915    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1916                      OCFS2_JOURNAL_ACCESS_WRITE);
1917    if (ret) {
1918        mlog_errno(ret);
1919        goto out;
1920    }
1921
1922    /*
1923     * We have calculated out how many new records we need and store
1924     * in recs_need, so spare enough space first by moving the records
1925     * after "index" to the end.
1926     */
1927    if (index != le16_to_cpu(rf_list->rl_used) - 1)
1928        memmove(&rf_list->rl_recs[index + 1 + recs_need],
1929            &rf_list->rl_recs[index + 1],
1930            (le16_to_cpu(rf_list->rl_used) - index - 1) *
1931             sizeof(struct ocfs2_refcount_rec));
1932
1933    len = (le64_to_cpu(orig_rec->r_cpos) +
1934          le32_to_cpu(orig_rec->r_clusters)) -
1935          (le64_to_cpu(split_rec->r_cpos) +
1936          le32_to_cpu(split_rec->r_clusters));
1937
1938    /*
1939     * If we have "len", the we will split in the tail and move it
1940     * to the end of the space we have just spared.
1941     */
1942    if (len) {
1943        tail_rec = &rf_list->rl_recs[index + recs_need];
1944
1945        memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1946        le64_add_cpu(&tail_rec->r_cpos,
1947                 le32_to_cpu(tail_rec->r_clusters) - len);
1948        tail_rec->r_clusters = cpu_to_le32(len);
1949    }
1950
1951    /*
1952     * If the split pos isn't the same as the original one, we need to
1953     * split in the head.
1954     *
1955     * Note: We have the chance that split_rec.r_refcount = 0,
1956     * recs_need = 0 and len > 0, which means we just cut the head from
1957     * the orig_rec and in that case we have done some modification in
1958     * orig_rec above, so the check for r_cpos is faked.
1959     */
1960    if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1961        len = le64_to_cpu(split_rec->r_cpos) -
1962              le64_to_cpu(orig_rec->r_cpos);
1963        orig_rec->r_clusters = cpu_to_le32(len);
1964        index++;
1965    }
1966
1967    le16_add_cpu(&rf_list->rl_used, recs_need);
1968
1969    if (split_rec->r_refcount) {
1970        rf_list->rl_recs[index] = *split_rec;
1971        trace_ocfs2_split_refcount_rec_insert(
1972            (unsigned long long)ref_leaf_bh->b_blocknr, index,
1973            (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1974            le32_to_cpu(split_rec->r_clusters),
1975            le32_to_cpu(split_rec->r_refcount));
1976
1977        if (merge)
1978            ocfs2_refcount_rec_merge(rb, index);
1979    }
1980
1981    ocfs2_journal_dirty(handle, ref_leaf_bh);
1982
1983out:
1984    brelse(new_bh);
1985    return ret;
1986}
1987
1988static int __ocfs2_increase_refcount(handle_t *handle,
1989                     struct ocfs2_caching_info *ci,
1990                     struct buffer_head *ref_root_bh,
1991                     u64 cpos, u32 len, int merge,
1992                     struct ocfs2_alloc_context *meta_ac,
1993                     struct ocfs2_cached_dealloc_ctxt *dealloc)
1994{
1995    int ret = 0, index;
1996    struct buffer_head *ref_leaf_bh = NULL;
1997    struct ocfs2_refcount_rec rec;
1998    unsigned int set_len = 0;
1999
2000    trace_ocfs2_increase_refcount_begin(
2001         (unsigned long long)ocfs2_metadata_cache_owner(ci),
2002         (unsigned long long)cpos, len);
2003
2004    while (len) {
2005        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2006                         cpos, len, &rec, &index,
2007                         &ref_leaf_bh);
2008        if (ret) {
2009            mlog_errno(ret);
2010            goto out;
2011        }
2012
2013        set_len = le32_to_cpu(rec.r_clusters);
2014
2015        /*
2016         * Here we may meet with 3 situations:
2017         *
2018         * 1. If we find an already existing record, and the length
2019         * is the same, cool, we just need to increase the r_refcount
2020         * and it is OK.
2021         * 2. If we find a hole, just insert it with r_refcount = 1.
2022         * 3. If we are in the middle of one extent record, split
2023         * it.
2024         */
2025        if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2026            set_len <= len) {
2027            trace_ocfs2_increase_refcount_change(
2028                (unsigned long long)cpos, set_len,
2029                le32_to_cpu(rec.r_refcount));
2030            ret = ocfs2_change_refcount_rec(handle, ci,
2031                            ref_leaf_bh, index,
2032                            merge, 1);
2033            if (ret) {
2034                mlog_errno(ret);
2035                goto out;
2036            }
2037        } else if (!rec.r_refcount) {
2038            rec.r_refcount = cpu_to_le32(1);
2039
2040            trace_ocfs2_increase_refcount_insert(
2041                 (unsigned long long)le64_to_cpu(rec.r_cpos),
2042                 set_len);
2043            ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2044                            ref_leaf_bh,
2045                            &rec, index,
2046                            merge, meta_ac);
2047            if (ret) {
2048                mlog_errno(ret);
2049                goto out;
2050            }
2051        } else {
2052            set_len = min((u64)(cpos + len),
2053                      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2054            rec.r_cpos = cpu_to_le64(cpos);
2055            rec.r_clusters = cpu_to_le32(set_len);
2056            le32_add_cpu(&rec.r_refcount, 1);
2057
2058            trace_ocfs2_increase_refcount_split(
2059                 (unsigned long long)le64_to_cpu(rec.r_cpos),
2060                 set_len, le32_to_cpu(rec.r_refcount));
2061            ret = ocfs2_split_refcount_rec(handle, ci,
2062                               ref_root_bh, ref_leaf_bh,
2063                               &rec, index, merge,
2064                               meta_ac, dealloc);
2065            if (ret) {
2066                mlog_errno(ret);
2067                goto out;
2068            }
2069        }
2070
2071        cpos += set_len;
2072        len -= set_len;
2073        brelse(ref_leaf_bh);
2074        ref_leaf_bh = NULL;
2075    }
2076
2077out:
2078    brelse(ref_leaf_bh);
2079    return ret;
2080}
2081
2082static int ocfs2_remove_refcount_extent(handle_t *handle,
2083                struct ocfs2_caching_info *ci,
2084                struct buffer_head *ref_root_bh,
2085                struct buffer_head *ref_leaf_bh,
2086                struct ocfs2_alloc_context *meta_ac,
2087                struct ocfs2_cached_dealloc_ctxt *dealloc)
2088{
2089    int ret;
2090    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2091    struct ocfs2_refcount_block *rb =
2092            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2093    struct ocfs2_extent_tree et;
2094
2095    BUG_ON(rb->rf_records.rl_used);
2096
2097    trace_ocfs2_remove_refcount_extent(
2098        (unsigned long long)ocfs2_metadata_cache_owner(ci),
2099        (unsigned long long)ref_leaf_bh->b_blocknr,
2100        le32_to_cpu(rb->rf_cpos));
2101
2102    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2103    ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2104                  1, meta_ac, dealloc);
2105    if (ret) {
2106        mlog_errno(ret);
2107        goto out;
2108    }
2109
2110    ocfs2_remove_from_cache(ci, ref_leaf_bh);
2111
2112    /*
2113     * add the freed block to the dealloc so that it will be freed
2114     * when we run dealloc.
2115     */
2116    ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2117                    le16_to_cpu(rb->rf_suballoc_slot),
2118                    le64_to_cpu(rb->rf_suballoc_loc),
2119                    le64_to_cpu(rb->rf_blkno),
2120                    le16_to_cpu(rb->rf_suballoc_bit));
2121    if (ret) {
2122        mlog_errno(ret);
2123        goto out;
2124    }
2125
2126    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2127                      OCFS2_JOURNAL_ACCESS_WRITE);
2128    if (ret) {
2129        mlog_errno(ret);
2130        goto out;
2131    }
2132
2133    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2134
2135    le32_add_cpu(&rb->rf_clusters, -1);
2136
2137    /*
2138     * check whether we need to restore the root refcount block if
2139     * there is no leaf extent block at atll.
2140     */
2141    if (!rb->rf_list.l_next_free_rec) {
2142        BUG_ON(rb->rf_clusters);
2143
2144        trace_ocfs2_restore_refcount_block(
2145             (unsigned long long)ref_root_bh->b_blocknr);
2146
2147        rb->rf_flags = 0;
2148        rb->rf_parent = 0;
2149        rb->rf_cpos = 0;
2150        memset(&rb->rf_records, 0, sb->s_blocksize -
2151               offsetof(struct ocfs2_refcount_block, rf_records));
2152        rb->rf_records.rl_count =
2153                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2154    }
2155
2156    ocfs2_journal_dirty(handle, ref_root_bh);
2157
2158out:
2159    return ret;
2160}
2161
2162int ocfs2_increase_refcount(handle_t *handle,
2163                struct ocfs2_caching_info *ci,
2164                struct buffer_head *ref_root_bh,
2165                u64 cpos, u32 len,
2166                struct ocfs2_alloc_context *meta_ac,
2167                struct ocfs2_cached_dealloc_ctxt *dealloc)
2168{
2169    return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2170                     cpos, len, 1,
2171                     meta_ac, dealloc);
2172}
2173
2174static int ocfs2_decrease_refcount_rec(handle_t *handle,
2175                struct ocfs2_caching_info *ci,
2176                struct buffer_head *ref_root_bh,
2177                struct buffer_head *ref_leaf_bh,
2178                int index, u64 cpos, unsigned int len,
2179                struct ocfs2_alloc_context *meta_ac,
2180                struct ocfs2_cached_dealloc_ctxt *dealloc)
2181{
2182    int ret;
2183    struct ocfs2_refcount_block *rb =
2184            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2185    struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2186
2187    BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2188    BUG_ON(cpos + len >
2189           le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2190
2191    trace_ocfs2_decrease_refcount_rec(
2192        (unsigned long long)ocfs2_metadata_cache_owner(ci),
2193        (unsigned long long)cpos, len);
2194
2195    if (cpos == le64_to_cpu(rec->r_cpos) &&
2196        len == le32_to_cpu(rec->r_clusters))
2197        ret = ocfs2_change_refcount_rec(handle, ci,
2198                        ref_leaf_bh, index, 1, -1);
2199    else {
2200        struct ocfs2_refcount_rec split = *rec;
2201        split.r_cpos = cpu_to_le64(cpos);
2202        split.r_clusters = cpu_to_le32(len);
2203
2204        le32_add_cpu(&split.r_refcount, -1);
2205
2206        ret = ocfs2_split_refcount_rec(handle, ci,
2207                           ref_root_bh, ref_leaf_bh,
2208                           &split, index, 1,
2209                           meta_ac, dealloc);
2210    }
2211
2212    if (ret) {
2213        mlog_errno(ret);
2214        goto out;
2215    }
2216
2217    /* Remove the leaf refcount block if it contains no refcount record. */
2218    if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2219        ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2220                           ref_leaf_bh, meta_ac,
2221                           dealloc);
2222        if (ret)
2223            mlog_errno(ret);
2224    }
2225
2226out:
2227    return ret;
2228}
2229
2230static int __ocfs2_decrease_refcount(handle_t *handle,
2231                     struct ocfs2_caching_info *ci,
2232                     struct buffer_head *ref_root_bh,
2233                     u64 cpos, u32 len,
2234                     struct ocfs2_alloc_context *meta_ac,
2235                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2236                     int delete)
2237{
2238    int ret = 0, index = 0;
2239    struct ocfs2_refcount_rec rec;
2240    unsigned int r_count = 0, r_len;
2241    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2242    struct buffer_head *ref_leaf_bh = NULL;
2243
2244    trace_ocfs2_decrease_refcount(
2245        (unsigned long long)ocfs2_metadata_cache_owner(ci),
2246        (unsigned long long)cpos, len, delete);
2247
2248    while (len) {
2249        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2250                         cpos, len, &rec, &index,
2251                         &ref_leaf_bh);
2252        if (ret) {
2253            mlog_errno(ret);
2254            goto out;
2255        }
2256
2257        r_count = le32_to_cpu(rec.r_refcount);
2258        BUG_ON(r_count == 0);
2259        if (!delete)
2260            BUG_ON(r_count > 1);
2261
2262        r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2263                  le32_to_cpu(rec.r_clusters)) - cpos;
2264
2265        ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2266                          ref_leaf_bh, index,
2267                          cpos, r_len,
2268                          meta_ac, dealloc);
2269        if (ret) {
2270            mlog_errno(ret);
2271            goto out;
2272        }
2273
2274        if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2275            ret = ocfs2_cache_cluster_dealloc(dealloc,
2276                      ocfs2_clusters_to_blocks(sb, cpos),
2277                              r_len);
2278            if (ret) {
2279                mlog_errno(ret);
2280                goto out;
2281            }
2282        }
2283
2284        cpos += r_len;
2285        len -= r_len;
2286        brelse(ref_leaf_bh);
2287        ref_leaf_bh = NULL;
2288    }
2289
2290out:
2291    brelse(ref_leaf_bh);
2292    return ret;
2293}
2294
2295/* Caller must hold refcount tree lock. */
2296int ocfs2_decrease_refcount(struct inode *inode,
2297                handle_t *handle, u32 cpos, u32 len,
2298                struct ocfs2_alloc_context *meta_ac,
2299                struct ocfs2_cached_dealloc_ctxt *dealloc,
2300                int delete)
2301{
2302    int ret;
2303    u64 ref_blkno;
2304    struct ocfs2_inode_info *oi = OCFS2_I(inode);
2305    struct buffer_head *ref_root_bh = NULL;
2306    struct ocfs2_refcount_tree *tree;
2307
2308    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2309
2310    ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2311    if (ret) {
2312        mlog_errno(ret);
2313        goto out;
2314    }
2315
2316    ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2317    if (ret) {
2318        mlog_errno(ret);
2319        goto out;
2320    }
2321
2322    ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2323                    &ref_root_bh);
2324    if (ret) {
2325        mlog_errno(ret);
2326        goto out;
2327    }
2328
2329    ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2330                    cpos, len, meta_ac, dealloc, delete);
2331    if (ret)
2332        mlog_errno(ret);
2333out:
2334    brelse(ref_root_bh);
2335    return ret;
2336}
2337
2338/*
2339 * Mark the already-existing extent at cpos as refcounted for len clusters.
2340 * This adds the refcount extent flag.
2341 *
2342 * If the existing extent is larger than the request, initiate a
2343 * split. An attempt will be made at merging with adjacent extents.
2344 *
2345 * The caller is responsible for passing down meta_ac if we'll need it.
2346 */
2347static int ocfs2_mark_extent_refcounted(struct inode *inode,
2348                struct ocfs2_extent_tree *et,
2349                handle_t *handle, u32 cpos,
2350                u32 len, u32 phys,
2351                struct ocfs2_alloc_context *meta_ac,
2352                struct ocfs2_cached_dealloc_ctxt *dealloc)
2353{
2354    int ret;
2355
2356    trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2357                       cpos, len, phys);
2358
2359    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2360        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2361                "tree, but the feature bit is not set in the "
2362                "super block.", inode->i_ino);
2363        ret = -EROFS;
2364        goto out;
2365    }
2366
2367    ret = ocfs2_change_extent_flag(handle, et, cpos,
2368                       len, phys, meta_ac, dealloc,
2369                       OCFS2_EXT_REFCOUNTED, 0);
2370    if (ret)
2371        mlog_errno(ret);
2372
2373out:
2374    return ret;
2375}
2376
2377/*
2378 * Given some contiguous physical clusters, calculate what we need
2379 * for modifying their refcount.
2380 */
2381static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2382                        struct ocfs2_caching_info *ci,
2383                        struct buffer_head *ref_root_bh,
2384                        u64 start_cpos,
2385                        u32 clusters,
2386                        int *meta_add,
2387                        int *credits)
2388{
2389    int ret = 0, index, ref_blocks = 0, recs_add = 0;
2390    u64 cpos = start_cpos;
2391    struct ocfs2_refcount_block *rb;
2392    struct ocfs2_refcount_rec rec;
2393    struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2394    u32 len;
2395
2396    while (clusters) {
2397        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2398                         cpos, clusters, &rec,
2399                         &index, &ref_leaf_bh);
2400        if (ret) {
2401            mlog_errno(ret);
2402            goto out;
2403        }
2404
2405        if (ref_leaf_bh != prev_bh) {
2406            /*
2407             * Now we encounter a new leaf block, so calculate
2408             * whether we need to extend the old leaf.
2409             */
2410            if (prev_bh) {
2411                rb = (struct ocfs2_refcount_block *)
2412                            prev_bh->b_data;
2413
2414                if (le64_to_cpu(rb->rf_records.rl_used) +
2415                    recs_add >
2416                    le16_to_cpu(rb->rf_records.rl_count))
2417                    ref_blocks++;
2418            }
2419
2420            recs_add = 0;
2421            *credits += 1;
2422            brelse(prev_bh);
2423            prev_bh = ref_leaf_bh;
2424            get_bh(prev_bh);
2425        }
2426
2427        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2428
2429        trace_ocfs2_calc_refcount_meta_credits_iterate(
2430                recs_add, (unsigned long long)cpos, clusters,
2431                (unsigned long long)le64_to_cpu(rec.r_cpos),
2432                le32_to_cpu(rec.r_clusters),
2433                le32_to_cpu(rec.r_refcount), index);
2434
2435        len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2436              le32_to_cpu(rec.r_clusters)) - cpos;
2437        /*
2438         * We record all the records which will be inserted to the
2439         * same refcount block, so that we can tell exactly whether
2440         * we need a new refcount block or not.
2441         *
2442         * If we will insert a new one, this is easy and only happens
2443         * during adding refcounted flag to the extent, so we don't
2444         * have a chance of spliting. We just need one record.
2445         *
2446         * If the refcount rec already exists, that would be a little
2447         * complicated. we may have to:
2448         * 1) split at the beginning if the start pos isn't aligned.
2449         * we need 1 more record in this case.
2450         * 2) split int the end if the end pos isn't aligned.
2451         * we need 1 more record in this case.
2452         * 3) split in the middle because of file system fragmentation.
2453         * we need 2 more records in this case(we can't detect this
2454         * beforehand, so always think of the worst case).
2455         */
2456        if (rec.r_refcount) {
2457            recs_add += 2;
2458            /* Check whether we need a split at the beginning. */
2459            if (cpos == start_cpos &&
2460                cpos != le64_to_cpu(rec.r_cpos))
2461                recs_add++;
2462
2463            /* Check whether we need a split in the end. */
2464            if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2465                le32_to_cpu(rec.r_clusters))
2466                recs_add++;
2467        } else
2468            recs_add++;
2469
2470        brelse(ref_leaf_bh);
2471        ref_leaf_bh = NULL;
2472        clusters -= len;
2473        cpos += len;
2474    }
2475
2476    if (prev_bh) {
2477        rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2478
2479        if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
2480            le16_to_cpu(rb->rf_records.rl_count))
2481            ref_blocks++;
2482
2483        *credits += 1;
2484    }
2485
2486    if (!ref_blocks)
2487        goto out;
2488
2489    *meta_add += ref_blocks;
2490    *credits += ref_blocks;
2491
2492    /*
2493     * So we may need ref_blocks to insert into the tree.
2494     * That also means we need to change the b-tree and add that number
2495     * of records since we never merge them.
2496     * We need one more block for expansion since the new created leaf
2497     * block is also full and needs split.
2498     */
2499    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2500    if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2501        struct ocfs2_extent_tree et;
2502
2503        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2504        *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2505        *credits += ocfs2_calc_extend_credits(sb,
2506                              et.et_root_el,
2507                              ref_blocks);
2508    } else {
2509        *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2510        *meta_add += 1;
2511    }
2512
2513out:
2514
2515    trace_ocfs2_calc_refcount_meta_credits(
2516        (unsigned long long)start_cpos, clusters,
2517        *meta_add, *credits);
2518    brelse(ref_leaf_bh);
2519    brelse(prev_bh);
2520    return ret;
2521}
2522
2523/*
2524 * For refcount tree, we will decrease some contiguous clusters
2525 * refcount count, so just go through it to see how many blocks
2526 * we gonna touch and whether we need to create new blocks.
2527 *
2528 * Normally the refcount blocks store these refcount should be
2529 * contiguous also, so that we can get the number easily.
2530 * We will at most add split 2 refcount records and 2 more
2531 * refcount blocks, so just check it in a rough way.
2532 *
2533 * Caller must hold refcount tree lock.
2534 */
2535int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2536                      u64 refcount_loc,
2537                      u64 phys_blkno,
2538                      u32 clusters,
2539                      int *credits,
2540                      int *ref_blocks)
2541{
2542    int ret;
2543    struct ocfs2_inode_info *oi = OCFS2_I(inode);
2544    struct buffer_head *ref_root_bh = NULL;
2545    struct ocfs2_refcount_tree *tree;
2546    u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2547
2548    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2549        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2550                "tree, but the feature bit is not set in the "
2551                "super block.", inode->i_ino);
2552        ret = -EROFS;
2553        goto out;
2554    }
2555
2556    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2557
2558    ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2559                      refcount_loc, &tree);
2560    if (ret) {
2561        mlog_errno(ret);
2562        goto out;
2563    }
2564
2565    ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
2566                    &ref_root_bh);
2567    if (ret) {
2568        mlog_errno(ret);
2569        goto out;
2570    }
2571
2572    ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2573                           &tree->rf_ci,
2574                           ref_root_bh,
2575                           start_cpos, clusters,
2576                           ref_blocks, credits);
2577    if (ret) {
2578        mlog_errno(ret);
2579        goto out;
2580    }
2581
2582    trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2583
2584out:
2585    brelse(ref_root_bh);
2586    return ret;
2587}
2588
/* Byte size of the contiguous unit used when breaking up extents for CoW. */
#define MAX_CONTIG_BYTES	1048576

/* MAX_CONTIG_BYTES expressed as a cluster count for this super block. */
static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
{
	unsigned int nr_clusters = ocfs2_clusters_for_bytes(sb,
							    MAX_CONTIG_BYTES);

	return nr_clusters;
}
2595
/*
 * Bit mask that clears the low bits of a cluster count below
 * ocfs2_cow_contig_clusters(): ~(contig_clusters - 1).
 */
static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
{
	unsigned int contig = ocfs2_cow_contig_clusters(sb);

	return ~(contig - 1);
}
2600
/*
 * For an extent beginning at 'start' and an I/O beginning at 'cpos',
 * return the offset of the form start + (n * contig_clusters) that is
 * closest to cpos without exceeding it.
 *
 * This lets the extent be broken at a multiple of contig_clusters.
 * 'start' must not be greater than 'cpos' (we BUG otherwise).
 */
static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
						 unsigned int start,
						 unsigned int cpos)
{
	unsigned int offset, mask;

	BUG_ON(start > cpos);

	offset = cpos - start;
	mask = ocfs2_cow_contig_mask(sb);

	return start + (offset & mask);
}
2616
2617/*
2618 * Given a cluster count of len, pad it out so that it is a multiple
2619 * of contig_clusters.
2620 */
2621static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2622                          unsigned int len)
2623{
2624    unsigned int padded =
2625        (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2626        ocfs2_cow_contig_mask(sb);
2627
2628    /* Did we wrap? */
2629    if (padded < len)
2630        padded = UINT_MAX;
2631
2632    return padded;
2633}
2634
/*
 * Calculate the start and number of virtual clusters we need to CoW.
 *
 * cpos is the virtual start cluster position at which we want to do CoW
 * in a file, and write_len is the length in clusters.
 * max_cpos is the place where we want to stop CoW intentionally.
 *
 * Normally we will start CoW from the beginning of the extent record
 * containing cpos.
 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
 * get good I/O from the resulting extent tree.
 */
2646static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2647                       struct ocfs2_extent_list *el,
2648                       u32 cpos,
2649                       u32 write_len,
2650                       u32 max_cpos,
2651                       u32 *cow_start,
2652                       u32 *cow_len)
2653{
2654    int ret = 0;
2655    int tree_height = le16_to_cpu(el->l_tree_depth), i;
2656    struct buffer_head *eb_bh = NULL;
2657    struct ocfs2_extent_block *eb = NULL;
2658    struct ocfs2_extent_rec *rec;
2659    unsigned int want_clusters, rec_end = 0;
2660    int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2661    int leaf_clusters;
2662
2663    BUG_ON(cpos + write_len > max_cpos);
2664
2665    if (tree_height > 0) {
2666        ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2667        if (ret) {
2668            mlog_errno(ret);
2669            goto out;
2670        }
2671
2672        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2673        el = &eb->h_list;
2674
2675        if (el->l_tree_depth) {
2676            ocfs2_error(inode->i_sb,
2677                    "Inode %lu has non zero tree depth in "
2678                    "leaf block %llu\n", inode->i_ino,
2679                    (unsigned long long)eb_bh->b_blocknr);
2680            ret = -EROFS;
2681            goto out;
2682        }
2683    }
2684
2685    *cow_len = 0;
2686    for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2687        rec = &el->l_recs[i];
2688
2689        if (ocfs2_is_empty_extent(rec)) {
2690            mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2691                    "index %d\n", inode->i_ino, i);
2692            continue;
2693        }
2694
2695        if (le32_to_cpu(rec->e_cpos) +
2696            le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2697            continue;
2698
2699        if (*cow_len == 0) {
2700            /*
2701             * We should find a refcounted record in the
2702             * first pass.
2703             */
2704            BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2705            *cow_start = le32_to_cpu(rec->e_cpos);
2706        }
2707
2708        /*
2709         * If we encounter a hole, a non-refcounted record or
2710         * pass the max_cpos, stop the search.
2711         */
2712        if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2713            (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2714            (max_cpos <= le32_to_cpu(rec->e_cpos)))
2715            break;
2716
2717        leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2718        rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2719        if (rec_end > max_cpos) {
2720            rec_end = max_cpos;
2721            leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2722        }
2723
2724        /*
2725         * How many clusters do we actually need from
2726         * this extent? First we see how many we actually
2727         * need to complete the write. If that's smaller
2728         * than contig_clusters, we try for contig_clusters.
2729         */
2730        if (!*cow_len)
2731            want_clusters = write_len;
2732        else
2733            want_clusters = (cpos + write_len) -
2734                (*cow_start + *cow_len);
2735        if (want_clusters < contig_clusters)
2736            want_clusters = contig_clusters;
2737
2738        /*
2739         * If the write does not cover the whole extent, we
2740         * need to calculate how we're going to split the extent.
2741         * We try to do it on contig_clusters boundaries.
2742         *
2743         * Any extent smaller than contig_clusters will be
2744         * CoWed in its entirety.
2745         */
2746        if (leaf_clusters <= contig_clusters)
2747            *cow_len += leaf_clusters;
2748        else if (*cow_len || (*cow_start == cpos)) {
2749            /*
2750             * This extent needs to be CoW'd from its
2751             * beginning, so all we have to do is compute
2752             * how many clusters to grab. We align
2753             * want_clusters to the edge of contig_clusters
2754             * to get better I/O.
2755             */
2756            want_clusters = ocfs2_cow_align_length(inode->i_sb,
2757                                   want_clusters);
2758
2759            if (leaf_clusters < want_clusters)
2760                *cow_len += leaf_clusters;
2761            else
2762                *cow_len += want_clusters;
2763        } else if ((*cow_start + contig_clusters) >=
2764               (cpos + write_len)) {
2765            /*
2766             * Breaking off contig_clusters at the front
2767             * of the extent will cover our write. That's
2768             * easy.
2769             */
2770            *cow_len = contig_clusters;
2771        } else if ((rec_end - cpos) <= contig_clusters) {
2772            /*
2773             * Breaking off contig_clusters at the tail of
2774             * this extent will cover cpos.
2775             */
2776            *cow_start = rec_end - contig_clusters;
2777            *cow_len = contig_clusters;
2778        } else if ((rec_end - cpos) <= want_clusters) {
2779            /*
2780             * While we can't fit the entire write in this
2781             * extent, we know that the write goes from cpos
2782             * to the end of the extent. Break that off.
2783             * We try to break it at some multiple of
2784             * contig_clusters from the front of the extent.
2785             * Failing that (ie, cpos is within
2786             * contig_clusters of the front), we'll CoW the
2787             * entire extent.
2788             */
2789            *cow_start = ocfs2_cow_align_start(inode->i_sb,
2790                               *cow_start, cpos);
2791            *cow_len = rec_end - *cow_start;
2792        } else {
2793            /*
2794             * Ok, the entire write lives in the middle of
2795             * this extent. Let's try to slice the extent up
2796             * nicely. Optimally, our CoW region starts at
2797             * m*contig_clusters from the beginning of the
2798             * extent and goes for n*contig_clusters,
2799             * covering the entire write.
2800             */
2801            *cow_start = ocfs2_cow_align_start(inode->i_sb,
2802                               *cow_start, cpos);
2803
2804            want_clusters = (cpos + write_len) - *cow_start;
2805            want_clusters = ocfs2_cow_align_length(inode->i_sb,
2806                                   want_clusters);
2807            if (*cow_start + want_clusters <= rec_end)
2808                *cow_len = want_clusters;
2809            else
2810                *cow_len = rec_end - *cow_start;
2811        }
2812
2813        /* Have we covered our entire write yet? */
2814        if ((*cow_start + *cow_len) >= (cpos + write_len))
2815            break;
2816
2817        /*
2818         * If we reach the end of the extent block and don't get enough
2819         * clusters, continue with the next extent block if possible.
2820         */
2821        if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2822            eb && eb->h_next_leaf_blk) {
2823            brelse(eb_bh);
2824            eb_bh = NULL;
2825
2826            ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2827                           le64_to_cpu(eb->h_next_leaf_blk),
2828                           &eb_bh);
2829            if (ret) {
2830                mlog_errno(ret);
2831                goto out;
2832            }
2833
2834            eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2835            el = &eb->h_list;
2836            i = -1;
2837        }
2838    }
2839
2840out:
2841    brelse(eb_bh);
2842    return ret;
2843}
2844
2845/*
2846 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2847 * num_clusters in data_tree "et" and change the refcount for the old
2848 * clusters(starting form p_cluster) in the refcount tree.
2849 *
2850 * Note:
2851 * 1. since we may split the old tree, so we at most will need num_clusters + 2
2852 * more new leaf records.
2853 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2854 * just give data_ac = NULL.
2855 */
2856static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2857                    u32 p_cluster, u32 num_clusters,
2858                    struct ocfs2_extent_tree *et,
2859                    struct ocfs2_caching_info *ref_ci,
2860                    struct buffer_head *ref_root_bh,
2861                    struct ocfs2_alloc_context **meta_ac,
2862                    struct ocfs2_alloc_context **data_ac,
2863                    int *credits)
2864{
2865    int ret = 0, meta_add = 0;
2866    int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2867
2868    if (num_free_extents < 0) {
2869        ret = num_free_extents;
2870        mlog_errno(ret);
2871        goto out;
2872    }
2873
2874    if (num_free_extents < num_clusters + 2)
2875        meta_add =
2876            ocfs2_extend_meta_needed(et->et_root_el);
2877
2878    *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2879                          num_clusters + 2);
2880
2881    ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2882                           p_cluster, num_clusters,
2883                           &meta_add, credits);
2884    if (ret) {
2885        mlog_errno(ret);
2886        goto out;
2887    }
2888
2889    trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2890    ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2891                        meta_ac);
2892    if (ret) {
2893        mlog_errno(ret);
2894        goto out;
2895    }
2896
2897    if (data_ac) {
2898        ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2899                         data_ac);
2900        if (ret)
2901            mlog_errno(ret);
2902    }
2903
2904out:
2905    if (ret) {
2906        if (*meta_ac) {
2907            ocfs2_free_alloc_context(*meta_ac);
2908            *meta_ac = NULL;
2909        }
2910    }
2911
2912    return ret;
2913}
2914
2915static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2916{
2917    BUG_ON(buffer_dirty(bh));
2918
2919    clear_buffer_mapped(bh);
2920
2921    return 0;
2922}
2923
2924int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925                     struct file *file,
2926                     u32 cpos, u32 old_cluster,
2927                     u32 new_cluster, u32 new_len)
2928{
2929    int ret = 0, partial;
2930    struct inode *inode = file->f_path.dentry->d_inode;
2931    struct ocfs2_caching_info *ci = INODE_CACHE(inode);
2932    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2933    u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2934    struct page *page;
2935    pgoff_t page_index;
2936    unsigned int from, to, readahead_pages;
2937    loff_t offset, end, map_end;
2938    struct address_space *mapping = inode->i_mapping;
2939
2940    trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2941                           new_cluster, new_len);
2942
2943    readahead_pages =
2944        (ocfs2_cow_contig_clusters(sb) <<
2945         OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
2946    offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2947    end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2948    /*
2949     * We only duplicate pages until we reach the page contains i_size - 1.
2950     * So trim 'end' to i_size.
2951     */
2952    if (end > i_size_read(inode))
2953        end = i_size_read(inode);
2954
2955    while (offset < end) {
2956        page_index = offset >> PAGE_CACHE_SHIFT;
2957        map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2958        if (map_end > end)
2959            map_end = end;
2960
2961        /* from, to is the offset within the page. */
2962        from = offset & (PAGE_CACHE_SIZE - 1);
2963        to = PAGE_CACHE_SIZE;
2964        if (map_end & (PAGE_CACHE_SIZE - 1))
2965            to = map_end & (PAGE_CACHE_SIZE - 1);
2966
2967        page = find_or_create_page(mapping, page_index, GFP_NOFS);
2968
2969        /*
2970         * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2971         * can't be dirtied before we CoW it out.
2972         */
2973        if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2974            BUG_ON(PageDirty(page));
2975
2976        if (PageReadahead(page)) {
2977            page_cache_async_readahead(mapping,
2978                           &file->f_ra, file,
2979                           page, page_index,
2980                           readahead_pages);
2981        }
2982
2983        if (!PageUptodate(page)) {
2984            ret = block_read_full_page(page, ocfs2_get_block);
2985            if (ret) {
2986                mlog_errno(ret);
2987                goto unlock;
2988            }
2989            lock_page(page);
2990        }
2991
2992        if (page_has_buffers(page)) {
2993            ret = walk_page_buffers(handle, page_buffers(page),
2994                        from, to, &partial,
2995                        ocfs2_clear_cow_buffer);
2996            if (ret) {
2997                mlog_errno(ret);
2998                goto unlock;
2999            }
3000        }
3001
3002        ocfs2_map_and_dirty_page(inode, handle, from, to,
3003                     page, 0, &new_block);
3004        mark_page_accessed(page);
3005unlock:
3006        unlock_page(page);
3007        page_cache_release(page);
3008        page = NULL;
3009        offset = map_end;
3010        if (ret)
3011            break;
3012    }
3013
3014    return ret;
3015}
3016
3017int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3018                    struct file *file,
3019                    u32 cpos, u32 old_cluster,
3020                    u32 new_cluster, u32 new_len)
3021{
3022    int ret = 0;
3023    struct inode *inode = file->f_path.dentry->d_inode;
3024    struct super_block *sb = inode->i_sb;
3025    struct ocfs2_caching_info *ci = INODE_CACHE(inode);
3026    int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3027    u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3028    u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3029    struct ocfs2_super *osb = OCFS2_SB(sb);
3030    struct buffer_head *old_bh = NULL;
3031    struct buffer_head *new_bh = NULL;
3032
3033    trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3034                           new_cluster, new_len);
3035
3036    for (i = 0; i < blocks; i++, old_block++, new_block++) {
3037        new_bh = sb_getblk(osb->sb, new_block);
3038        if (new_bh == NULL) {
3039            ret = -EIO;
3040            mlog_errno(ret);
3041            break;
3042        }
3043
3044        ocfs2_set_new_buffer_uptodate(ci, new_bh);
3045
3046        ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3047        if (ret) {
3048            mlog_errno(ret);
3049            break;
3050        }
3051
3052        ret = ocfs2_journal_access(handle, ci, new_bh,
3053                       OCFS2_JOURNAL_ACCESS_CREATE);
3054        if (ret) {
3055            mlog_errno(ret);
3056            break;
3057        }
3058
3059        memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3060        ocfs2_journal_dirty(handle, new_bh);
3061
3062        brelse(new_bh);
3063        brelse(old_bh);
3064        new_bh = NULL;
3065        old_bh = NULL;
3066    }
3067
3068    brelse(new_bh);
3069    brelse(old_bh);
3070    return ret;
3071}
3072
3073static int ocfs2_clear_ext_refcount(handle_t *handle,
3074                    struct ocfs2_extent_tree *et,
3075                    u32 cpos, u32 p_cluster, u32 len,
3076                    unsigned int ext_flags,
3077                    struct ocfs2_alloc_context *meta_ac,
3078                    struct ocfs2_cached_dealloc_ctxt *dealloc)
3079{
3080    int ret, index;
3081    struct ocfs2_extent_rec replace_rec;
3082    struct ocfs2_path *path = NULL;
3083    struct ocfs2_extent_list *el;
3084    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3085    u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3086
3087    trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3088                       cpos, len, p_cluster, ext_flags);
3089
3090    memset(&replace_rec, 0, sizeof(replace_rec));
3091    replace_rec.e_cpos = cpu_to_le32(cpos);
3092    replace_rec.e_leaf_clusters = cpu_to_le16(len);
3093    replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3094                                   p_cluster));
3095    replace_rec.e_flags = ext_flags;
3096    replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3097
3098    path = ocfs2_new_path_from_et(et);
3099    if (!path) {
3100        ret = -ENOMEM;
3101        mlog_errno(ret);
3102        goto out;
3103    }
3104
3105    ret = ocfs2_find_path(et->et_ci, path, cpos);
3106    if (ret) {
3107        mlog_errno(ret);
3108        goto out;
3109    }
3110
3111    el = path_leaf_el(path);
3112
3113    index = ocfs2_search_extent_list(el, cpos);
3114    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3115        ocfs2_error(sb,
3116                "Inode %llu has an extent at cpos %u which can no "
3117                "longer be found.\n",
3118                (unsigned long long)ino, cpos);
3119        ret = -EROFS;
3120        goto out;
3121    }
3122
3123    ret = ocfs2_split_extent(handle, et, path, index,
3124                 &replace_rec, meta_ac, dealloc);
3125    if (ret)
3126        mlog_errno(ret);
3127
3128out:
3129    ocfs2_free_path(path);
3130    return ret;
3131}
3132
3133static int ocfs2_replace_clusters(handle_t *handle,
3134                  struct ocfs2_cow_context *context,
3135                  u32 cpos, u32 old,
3136                  u32 new, u32 len,
3137                  unsigned int ext_flags)
3138{
3139    int ret;
3140    struct ocfs2_caching_info *ci = context->data_et.et_ci;
3141    u64 ino = ocfs2_metadata_cache_owner(ci);
3142
3143    trace_ocfs2_replace_clusters((unsigned long long)ino,
3144                     cpos, old, new, len, ext_flags);
3145
3146    /*If the old clusters is unwritten, no need to duplicate. */
3147    if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3148        ret = context->cow_duplicate_clusters(handle, context->file,
3149                              cpos, old, new, len);
3150        if (ret) {
3151            mlog_errno(ret);
3152            goto out;
3153        }
3154    }
3155
3156    ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3157                       cpos, new, len, ext_flags,
3158                       context->meta_ac, &context->dealloc);
3159    if (ret)
3160        mlog_errno(ret);
3161out:
3162    return ret;
3163}
3164
3165int ocfs2_cow_sync_writeback(struct super_block *sb,
3166                 struct inode *inode,
3167                 u32 cpos, u32 num_clusters)
3168{
3169    int ret = 0;
3170    loff_t offset, end, map_end;
3171    pgoff_t page_index;
3172    struct page *page;
3173
3174    if (ocfs2_should_order_data(inode))
3175        return 0;
3176
3177    offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3178    end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3179
3180    ret = filemap_fdatawrite_range(inode->i_mapping,
3181                       offset, end - 1);
3182    if (ret < 0) {
3183        mlog_errno(ret);
3184        return ret;
3185    }
3186
3187    while (offset < end) {
3188        page_index = offset >> PAGE_CACHE_SHIFT;
3189        map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3190        if (map_end > end)
3191            map_end = end;
3192
3193        page = find_or_create_page(inode->i_mapping,
3194                       page_index, GFP_NOFS);
3195        BUG_ON(!page);
3196
3197        wait_on_page_writeback(page);
3198        if (PageError(page)) {
3199            ret = -EIO;
3200            mlog_errno(ret);
3201        } else
3202            mark_page_accessed(page);
3203
3204        unlock_page(page);
3205        page_cache_release(page);
3206        page = NULL;
3207        offset = map_end;
3208        if (ret)
3209            break;
3210    }
3211
3212    return ret;
3213}
3214
3215static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3216                 u32 v_cluster, u32 *p_cluster,
3217                 u32 *num_clusters,
3218                 unsigned int *extent_flags)
3219{
3220    return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3221                  num_clusters, extent_flags);
3222}
3223
3224static int ocfs2_make_clusters_writable(struct super_block *sb,
3225                    struct ocfs2_cow_context *context,
3226                    u32 cpos, u32 p_cluster,
3227                    u32 num_clusters, unsigned int e_flags)
3228{
3229    int ret, delete, index, credits = 0;
3230    u32 new_bit, new_len, orig_num_clusters;
3231    unsigned int set_len;
3232    struct ocfs2_super *osb = OCFS2_SB(sb);
3233    handle_t *handle;
3234    struct buffer_head *ref_leaf_bh = NULL;
3235    struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3236    struct ocfs2_refcount_rec rec;
3237
3238    trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3239                       num_clusters, e_flags);
3240
3241    ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3242                         &context->data_et,
3243                         ref_ci,
3244                         context->ref_root_bh,
3245                         &context->meta_ac,
3246                         &context->data_ac, &credits);
3247    if (ret) {
3248        mlog_errno(ret);
3249        return ret;
3250    }
3251
3252    if (context->post_refcount)
3253        credits += context->post_refcount->credits;
3254
3255    credits += context->extra_credits;
3256    handle = ocfs2_start_trans(osb, credits);
3257    if (IS_ERR(handle)) {
3258        ret = PTR_ERR(handle);
3259        mlog_errno(ret);
3260        goto out;
3261    }
3262
3263    orig_num_clusters = num_clusters;
3264
3265    while (num_clusters) {
3266        ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3267                         p_cluster, num_clusters,
3268                         &rec, &index, &ref_leaf_bh);
3269        if (ret) {
3270            mlog_errno(ret);
3271            goto out_commit;
3272        }
3273
3274        BUG_ON(!rec.r_refcount);
3275        set_len = min((u64)p_cluster + num_clusters,
3276                  le64_to_cpu(rec.r_cpos) +
3277                  le32_to_cpu(rec.r_clusters)) - p_cluster;
3278
3279        /*
3280         * There are many different situation here.
3281         * 1. If refcount == 1, remove the flag and don't COW.
3282         * 2. If refcount > 1, allocate clusters.
3283         * Here we may not allocate r_len once at a time, so continue
3284         * until we reach num_clusters.
3285         */
3286        if (le32_to_cpu(rec.r_refcount) == 1) {
3287            delete = 0;
3288            ret = ocfs2_clear_ext_refcount(handle,
3289                               &context->data_et,
3290                               cpos, p_cluster,
3291                               set_len, e_flags,
3292                               context->meta_ac,
3293                               &context->dealloc);
3294            if (ret) {
3295                mlog_errno(ret);
3296                goto out_commit;
3297            }
3298        } else {
3299            delete = 1;
3300
3301            ret = __ocfs2_claim_clusters(handle,
3302                             context->data_ac,
3303                             1, set_len,
3304                             &new_bit, &new_len);
3305            if (ret) {
3306                mlog_errno(ret);
3307                goto out_commit;
3308            }
3309
3310            ret = ocfs2_replace_clusters(handle, context,
3311                             cpos, p_cluster, new_bit,
3312                             new_len, e_flags);
3313            if (ret) {
3314                mlog_errno(ret);
3315                goto out_commit;
3316            }
3317            set_len = new_len;
3318        }
3319
3320        ret = __ocfs2_decrease_refcount(handle, ref_ci,
3321                        context->ref_root_bh,
3322                        p_cluster, set_len,
3323                        context->meta_ac,
3324                        &context->dealloc, delete);
3325        if (ret) {
3326            mlog_errno(ret);
3327            goto out_commit;
3328        }
3329
3330        cpos += set_len;
3331        p_cluster += set_len;
3332        num_clusters -= set_len;
3333        brelse(ref_leaf_bh);
3334        ref_leaf_bh = NULL;
3335    }
3336
3337    /* handle any post_cow action. */
3338    if (context->post_refcount && context->post_refcount->func) {
3339        ret = context->post_refcount->func(context->inode, handle,
3340                        context->post_refcount->para);
3341        if (ret) {
3342            mlog_errno(ret);
3343            goto out_commit;
3344        }
3345    }
3346
3347    /*
3348     * Here we should write the new page out first if we are
3349     * in write-back mode.
3350     */
3351    if (context->get_clusters == ocfs2_di_get_clusters) {
3352        ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
3353                           orig_num_clusters);
3354        if (ret)
3355            mlog_errno(ret);
3356    }
3357
3358out_commit:
3359    ocfs2_commit_trans(osb, handle);
3360
3361out:
3362    if (context->data_ac) {
3363        ocfs2_free_alloc_context(context->data_ac);
3364        context->data_ac = NULL;
3365    }
3366    if (context->meta_ac) {
3367        ocfs2_free_alloc_context(context->meta_ac);
3368        context->meta_ac = NULL;
3369    }
3370    brelse(ref_leaf_bh);
3371
3372    return ret;
3373}
3374
3375static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3376{
3377    int ret = 0;
3378    struct inode *inode = context->inode;
3379    u32 cow_start = context->cow_start, cow_len = context->cow_len;
3380    u32 p_cluster, num_clusters;
3381    unsigned int ext_flags;
3382    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3383
3384    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3385        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3386                "tree, but the feature bit is not set in the "
3387                "super block.", inode->i_ino);
3388        return -EROFS;
3389    }
3390
3391    ocfs2_init_dealloc_ctxt(&context->dealloc);
3392
3393    while (cow_len) {
3394        ret = context->get_clusters(context, cow_start, &p_cluster,
3395                        &num_clusters, &ext_flags);
3396        if (ret) {
3397            mlog_errno(ret);
3398            break;
3399        }
3400
3401        BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3402
3403        if (cow_len < num_clusters)
3404            num_clusters = cow_len;
3405
3406        ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3407                           cow_start, p_cluster,
3408                           num_clusters, ext_flags);
3409        if (ret) {
3410            mlog_errno(ret);
3411            break;
3412        }
3413
3414        cow_len -= num_clusters;
3415        cow_start += num_clusters;
3416    }
3417
3418    if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3419        ocfs2_schedule_truncate_log_flush(osb, 1);
3420        ocfs2_run_deallocs(osb, &context->dealloc);
3421    }
3422
3423    return ret;
3424}
3425
3426static void ocfs2_readahead_for_cow(struct inode *inode,
3427                    struct file *file,
3428                    u32 start, u32 len)
3429{
3430    struct address_space *mapping;
3431    pgoff_t index;
3432    unsigned long num_pages;
3433    int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
3434
3435    if (!file)
3436        return;
3437
3438    mapping = file->f_mapping;
3439    num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT;
3440    if (!num_pages)
3441        num_pages = 1;
3442
3443    index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT;
3444    page_cache_sync_readahead(mapping, &file->f_ra, file,
3445                  index, num_pages);
3446}
3447
3448/*
3449 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3450 * past max_cpos. This will stop when it runs into a hole or an
3451 * unrefcounted extent.
3452 */
3453static int ocfs2_refcount_cow_hunk(struct inode *inode,
3454                   struct file *file,
3455                   struct buffer_head *di_bh,
3456                   u32 cpos, u32 write_len, u32 max_cpos)
3457{
3458    int ret;
3459    u32 cow_start = 0, cow_len = 0;
3460    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3461    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3462    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3463    struct buffer_head *ref_root_bh = NULL;
3464    struct ocfs2_refcount_tree *ref_tree;
3465    struct ocfs2_cow_context *context = NULL;
3466
3467    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3468
3469    ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3470                          cpos, write_len, max_cpos,
3471                          &cow_start, &cow_len);
3472    if (ret) {
3473        mlog_errno(ret);
3474        goto out;
3475    }
3476
3477    trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3478                      cpos, write_len, max_cpos,
3479                      cow_start, cow_len);
3480
3481    BUG_ON(cow_len == 0);
3482
3483    ocfs2_readahead_for_cow(inode, file, cow_start, cow_len);
3484
3485    context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3486    if (!context) {
3487        ret = -ENOMEM;
3488        mlog_errno(ret);
3489        goto out;
3490    }
3491
3492    ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3493                       1, &ref_tree, &ref_root_bh);
3494    if (ret) {
3495        mlog_errno(ret);
3496        goto out;
3497    }
3498
3499    context->inode = inode;
3500    context->cow_start = cow_start;
3501    context->cow_len = cow_len;
3502    context->ref_tree = ref_tree;
3503    context->ref_root_bh = ref_root_bh;
3504    context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3505    context->get_clusters = ocfs2_di_get_clusters;
3506    context->file = file;
3507
3508    ocfs2_init_dinode_extent_tree(&context->data_et,
3509                      INODE_CACHE(inode), di_bh);
3510
3511    ret = ocfs2_replace_cow(context);
3512    if (ret)
3513        mlog_errno(ret);
3514
3515    /*
3516     * truncate the extent map here since no matter whether we meet with
3517     * any error during the action, we shouldn't trust cached extent map
3518     * any more.
3519     */
3520    ocfs2_extent_map_trunc(inode, cow_start);
3521
3522    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3523    brelse(ref_root_bh);
3524out:
3525    kfree(context);
3526    return ret;
3527}
3528
3529/*
3530 * CoW any and all clusters between cpos and cpos+write_len.
3531 * Don't CoW past max_cpos. If this returns successfully, all
3532 * clusters between cpos and cpos+write_len are safe to modify.
3533 */
3534int ocfs2_refcount_cow(struct inode *inode,
3535               struct file *file,
3536               struct buffer_head *di_bh,
3537               u32 cpos, u32 write_len, u32 max_cpos)
3538{
3539    int ret = 0;
3540    u32 p_cluster, num_clusters;
3541    unsigned int ext_flags;
3542
3543    while (write_len) {
3544        ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3545                     &num_clusters, &ext_flags);
3546        if (ret) {
3547            mlog_errno(ret);
3548            break;
3549        }
3550
3551        if (write_len < num_clusters)
3552            num_clusters = write_len;
3553
3554        if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3555            ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos,
3556                              num_clusters, max_cpos);
3557            if (ret) {
3558                mlog_errno(ret);
3559                break;
3560            }
3561        }
3562
3563        write_len -= num_clusters;
3564        cpos += num_clusters;
3565    }
3566
3567    return ret;
3568}
3569
3570static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3571                      u32 v_cluster, u32 *p_cluster,
3572                      u32 *num_clusters,
3573                      unsigned int *extent_flags)
3574{
3575    struct inode *inode = context->inode;
3576    struct ocfs2_xattr_value_root *xv = context->cow_object;
3577
3578    return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3579                    num_clusters, &xv->xr_list,
3580                    extent_flags);
3581}
3582
3583/*
3584 * Given a xattr value root, calculate the most meta/credits we need for
3585 * refcount tree change if we truncate it to 0.
3586 */
3587int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3588                       struct ocfs2_caching_info *ref_ci,
3589                       struct buffer_head *ref_root_bh,
3590                       struct ocfs2_xattr_value_root *xv,
3591                       int *meta_add, int *credits)
3592{
3593    int ret = 0, index, ref_blocks = 0;
3594    u32 p_cluster, num_clusters;
3595    u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3596    struct ocfs2_refcount_block *rb;
3597    struct ocfs2_refcount_rec rec;
3598    struct buffer_head *ref_leaf_bh = NULL;
3599
3600    while (cpos < clusters) {
3601        ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3602                           &num_clusters, &xv->xr_list,
3603                           NULL);
3604        if (ret) {
3605            mlog_errno(ret);
3606            goto out;
3607        }
3608
3609        cpos += num_clusters;
3610
3611        while (num_clusters) {
3612            ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3613                             p_cluster, num_clusters,
3614                             &rec, &index,
3615                             &ref_leaf_bh);
3616            if (ret) {
3617                mlog_errno(ret);
3618                goto out;
3619            }
3620
3621            BUG_ON(!rec.r_refcount);
3622
3623            rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3624
3625            /*
3626             * We really don't know whether the other clusters is in
3627             * this refcount block or not, so just take the worst
3628             * case that all the clusters are in this block and each
3629             * one will split a refcount rec, so totally we need
3630             * clusters * 2 new refcount rec.
3631             */
3632            if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3633                le16_to_cpu(rb->rf_records.rl_count))
3634                ref_blocks++;
3635
3636            *credits += 1;
3637            brelse(ref_leaf_bh);
3638            ref_leaf_bh = NULL;
3639
3640            if (num_clusters <= le32_to_cpu(rec.r_clusters))
3641                break;
3642            else
3643                num_clusters -= le32_to_cpu(rec.r_clusters);
3644            p_cluster += num_clusters;
3645        }
3646    }
3647
3648    *meta_add += ref_blocks;
3649    if (!ref_blocks)
3650        goto out;
3651
3652    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3653    if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3654        *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3655    else {
3656        struct ocfs2_extent_tree et;
3657
3658        ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3659        *credits += ocfs2_calc_extend_credits(inode->i_sb,
3660                              et.et_root_el,
3661                              ref_blocks);
3662    }
3663
3664out:
3665    brelse(ref_leaf_bh);
3666    return ret;
3667}
3668
3669/*
3670 * Do CoW for xattr.
3671 */
3672int ocfs2_refcount_cow_xattr(struct inode *inode,
3673                 struct ocfs2_dinode *di,
3674                 struct ocfs2_xattr_value_buf *vb,
3675                 struct ocfs2_refcount_tree *ref_tree,
3676                 struct buffer_head *ref_root_bh,
3677                 u32 cpos, u32 write_len,
3678                 struct ocfs2_post_refcount *post)
3679{
3680    int ret;
3681    struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3682    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3683    struct ocfs2_cow_context *context = NULL;
3684    u32 cow_start, cow_len;
3685
3686    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3687
3688    ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3689                          cpos, write_len, UINT_MAX,
3690                          &cow_start, &cow_len);
3691    if (ret) {
3692        mlog_errno(ret);
3693        goto out;
3694    }
3695
3696    BUG_ON(cow_len == 0);
3697
3698    context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3699    if (!context) {
3700        ret = -ENOMEM;
3701        mlog_errno(ret);
3702        goto out;
3703    }
3704
3705    context->inode = inode;
3706    context->cow_start = cow_start;
3707    context->cow_len = cow_len;
3708    context->ref_tree = ref_tree;
3709    context->ref_root_bh = ref_root_bh;
3710    context->cow_object = xv;
3711
3712    context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3713    /* We need the extra credits for duplicate_clusters by jbd. */
3714    context->extra_credits =
3715        ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3716    context->get_clusters = ocfs2_xattr_value_get_clusters;
3717    context->post_refcount = post;
3718
3719    ocfs2_init_xattr_value_extent_tree(&context->data_et,
3720                       INODE_CACHE(inode), vb);
3721
3722    ret = ocfs2_replace_cow(context);
3723    if (ret)
3724        mlog_errno(ret);
3725
3726out:
3727    kfree(context);
3728    return ret;
3729}
3730
3731/*
3732 * Insert a new extent into refcount tree and mark a extent rec
3733 * as refcounted in the dinode tree.
3734 */
3735int ocfs2_add_refcount_flag(struct inode *inode,
3736                struct ocfs2_extent_tree *data_et,
3737                struct ocfs2_caching_info *ref_ci,
3738                struct buffer_head *ref_root_bh,
3739                u32 cpos, u32 p_cluster, u32 num_clusters,
3740                struct ocfs2_cached_dealloc_ctxt *dealloc,
3741                struct ocfs2_post_refcount *post)
3742{
3743    int ret;
3744    handle_t *handle;
3745    int credits = 1, ref_blocks = 0;
3746    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3747    struct ocfs2_alloc_context *meta_ac = NULL;
3748
3749    ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3750                           ref_ci, ref_root_bh,
3751                           p_cluster, num_clusters,
3752                           &ref_blocks, &credits);
3753    if (ret) {
3754        mlog_errno(ret);
3755        goto out;
3756    }
3757
3758    trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3759
3760    if (ref_blocks) {
3761        ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3762                            ref_blocks, &meta_ac);
3763        if (ret) {
3764            mlog_errno(ret);
3765            goto out;
3766        }
3767    }
3768
3769    if (post)
3770        credits += post->credits;
3771
3772    handle = ocfs2_start_trans(osb, credits);
3773    if (IS_ERR(handle)) {
3774        ret = PTR_ERR(handle);
3775        mlog_errno(ret);
3776        goto out;
3777    }
3778
3779    ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3780                       cpos, num_clusters, p_cluster,
3781                       meta_ac, dealloc);
3782    if (ret) {
3783        mlog_errno(ret);
3784        goto out_commit;
3785    }
3786
3787    ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3788                    p_cluster, num_clusters, 0,
3789                    meta_ac, dealloc);
3790    if (ret) {
3791        mlog_errno(ret);
3792        goto out_commit;
3793    }
3794
3795    if (post && post->func) {
3796        ret = post->func(inode, handle, post->para);
3797        if (ret)
3798            mlog_errno(ret);
3799    }
3800
3801out_commit:
3802    ocfs2_commit_trans(osb, handle);
3803out:
3804    if (meta_ac)
3805        ocfs2_free_alloc_context(meta_ac);
3806    return ret;
3807}
3808
3809static int ocfs2_change_ctime(struct inode *inode,
3810                  struct buffer_head *di_bh)
3811{
3812    int ret;
3813    handle_t *handle;
3814    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3815
3816    handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3817                   OCFS2_INODE_UPDATE_CREDITS);
3818    if (IS_ERR(handle)) {
3819        ret = PTR_ERR(handle);
3820        mlog_errno(ret);
3821        goto out;
3822    }
3823
3824    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3825                      OCFS2_JOURNAL_ACCESS_WRITE);
3826    if (ret) {
3827        mlog_errno(ret);
3828        goto out_commit;
3829    }
3830
3831    inode->i_ctime = CURRENT_TIME;
3832    di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3833    di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3834
3835    ocfs2_journal_dirty(handle, di_bh);
3836
3837out_commit:
3838    ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3839out:
3840    return ret;
3841}
3842
3843static int ocfs2_attach_refcount_tree(struct inode *inode,
3844                      struct buffer_head *di_bh)
3845{
3846    int ret, data_changed = 0;
3847    struct buffer_head *ref_root_bh = NULL;
3848    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3849    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3850    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3851    struct ocfs2_refcount_tree *ref_tree;
3852    unsigned int ext_flags;
3853    loff_t size;
3854    u32 cpos, num_clusters, clusters, p_cluster;
3855    struct ocfs2_cached_dealloc_ctxt dealloc;
3856    struct ocfs2_extent_tree di_et;
3857
3858    ocfs2_init_dealloc_ctxt(&dealloc);
3859
3860    if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3861        ret = ocfs2_create_refcount_tree(inode, di_bh);
3862        if (ret) {
3863            mlog_errno(ret);
3864            goto out;
3865        }
3866    }
3867
3868    BUG_ON(!di->i_refcount_loc);
3869    ret = ocfs2_lock_refcount_tree(osb,
3870                       le64_to_cpu(di->i_refcount_loc), 1,
3871                       &ref_tree, &ref_root_bh);
3872    if (ret) {
3873        mlog_errno(ret);
3874        goto out;
3875    }
3876
3877    if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3878        goto attach_xattr;
3879
3880    ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3881
3882    size = i_size_read(inode);
3883    clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3884
3885    cpos = 0;
3886    while (cpos < clusters) {
3887        ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3888                     &num_clusters, &ext_flags);
3889
3890        if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3891            ret = ocfs2_add_refcount_flag(inode, &di_et,
3892                              &ref_tree->rf_ci,
3893                              ref_root_bh, cpos,
3894                              p_cluster, num_clusters,
3895                              &dealloc, NULL);
3896            if (ret) {
3897                mlog_errno(ret);
3898                goto unlock;
3899            }
3900
3901            data_changed = 1;
3902        }
3903        cpos += num_clusters;
3904    }
3905
3906attach_xattr:
3907    if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3908        ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3909                               &ref_tree->rf_ci,
3910                               ref_root_bh,
3911                               &dealloc);
3912        if (ret) {
3913            mlog_errno(ret);
3914            goto unlock;
3915        }
3916    }
3917
3918    if (data_changed) {
3919        ret = ocfs2_change_ctime(inode, di_bh);
3920        if (ret)
3921            mlog_errno(ret);
3922    }
3923
3924unlock:
3925    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3926    brelse(ref_root_bh);
3927
3928    if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3929        ocfs2_schedule_truncate_log_flush(osb, 1);
3930        ocfs2_run_deallocs(osb, &dealloc);
3931    }
3932out:
3933    /*
3934     * Empty the extent map so that we may get the right extent
3935     * record from the disk.
3936     */
3937    ocfs2_extent_map_trunc(inode, 0);
3938
3939    return ret;
3940}
3941
3942static int ocfs2_add_refcounted_extent(struct inode *inode,
3943                   struct ocfs2_extent_tree *et,
3944                   struct ocfs2_caching_info *ref_ci,
3945                   struct buffer_head *ref_root_bh,
3946                   u32 cpos, u32 p_cluster, u32 num_clusters,
3947                   unsigned int ext_flags,
3948                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3949{
3950    int ret;
3951    handle_t *handle;
3952    int credits = 0;
3953    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3954    struct ocfs2_alloc_context *meta_ac = NULL;
3955
3956    ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3957                         p_cluster, num_clusters,
3958                         et, ref_ci,
3959                         ref_root_bh, &meta_ac,
3960                         NULL, &credits);
3961    if (ret) {
3962        mlog_errno(ret);
3963        goto out;
3964    }
3965
3966    handle = ocfs2_start_trans(osb, credits);
3967    if (IS_ERR(handle)) {
3968        ret = PTR_ERR(handle);
3969        mlog_errno(ret);
3970        goto out;
3971    }
3972
3973    ret = ocfs2_insert_extent(handle, et, cpos,
3974            ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3975            num_clusters, ext_flags, meta_ac);
3976    if (ret) {
3977        mlog_errno(ret);
3978        goto out_commit;
3979    }
3980
3981    ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3982                      p_cluster, num_clusters,
3983                      meta_ac, dealloc);
3984    if (ret)
3985        mlog_errno(ret);
3986
3987out_commit:
3988    ocfs2_commit_trans(osb, handle);
3989out:
3990    if (meta_ac)
3991        ocfs2_free_alloc_context(meta_ac);
3992    return ret;
3993}
3994
3995static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3996                       struct buffer_head *s_bh,
3997                       struct inode *t_inode,
3998                       struct buffer_head *t_bh)
3999{
4000    int ret;
4001    handle_t *handle;
4002    struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4003    struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4004    struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
4005
4006    BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
4007
4008    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
4009    if (IS_ERR(handle)) {
4010        ret = PTR_ERR(handle);
4011        mlog_errno(ret);
4012        goto out;
4013    }
4014
4015    ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4016                      OCFS2_JOURNAL_ACCESS_WRITE);
4017    if (ret) {
4018        mlog_errno(ret);
4019        goto out_commit;
4020    }
4021
4022    t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
4023    memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
4024           le16_to_cpu(s_di->id2.i_data.id_count));
4025    spin_lock(&OCFS2_I(t_inode)->ip_lock);
4026    OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
4027    t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
4028    spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4029
4030    ocfs2_journal_dirty(handle, t_bh);
4031
4032out_commit:
4033    ocfs2_commit_trans(osb, handle);
4034out:
4035    return ret;
4036}
4037
4038static int ocfs2_duplicate_extent_list(struct inode *s_inode,
4039                struct inode *t_inode,
4040                struct buffer_head *t_bh,
4041                struct ocfs2_caching_info *ref_ci,
4042                struct buffer_head *ref_root_bh,
4043                struct ocfs2_cached_dealloc_ctxt *dealloc)
4044{
4045    int ret = 0;
4046    u32 p_cluster, num_clusters, clusters, cpos;
4047    loff_t size;
4048    unsigned int ext_flags;
4049    struct ocfs2_extent_tree et;
4050
4051    ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
4052
4053    size = i_size_read(s_inode);
4054    clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4055
4056    cpos = 0;
4057    while (cpos < clusters) {
4058        ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4059                     &num_clusters, &ext_flags);
4060
4061        if (p_cluster) {
4062            ret = ocfs2_add_refcounted_extent(t_inode, &et,
4063                              ref_ci, ref_root_bh,
4064                              cpos, p_cluster,
4065                              num_clusters,
4066                              ext_flags,
4067                              dealloc);
4068            if (ret) {
4069                mlog_errno(ret);
4070                goto out;
4071            }
4072        }
4073
4074        cpos += num_clusters;
4075    }
4076
4077out:
4078    return ret;
4079}
4080
4081/*
4082 * change the new file's attributes to the src.
4083 *
4084 * reflink creates a snapshot of a file, that means the attributes
4085 * must be identical except for three exceptions - nlink, ino, and ctime.
4086 */
4087static int ocfs2_complete_reflink(struct inode *s_inode,
4088                  struct buffer_head *s_bh,
4089                  struct inode *t_inode,
4090                  struct buffer_head *t_bh,
4091                  bool preserve)
4092{
4093    int ret;
4094    handle_t *handle;
4095    struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4096    struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4097    loff_t size = i_size_read(s_inode);
4098
4099    handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4100                   OCFS2_INODE_UPDATE_CREDITS);
4101    if (IS_ERR(handle)) {
4102        ret = PTR_ERR(handle);
4103        mlog_errno(ret);
4104        return ret;
4105    }
4106
4107    ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4108                      OCFS2_JOURNAL_ACCESS_WRITE);
4109    if (ret) {
4110        mlog_errno(ret);
4111        goto out_commit;
4112    }
4113
4114    spin_lock(&OCFS2_I(t_inode)->ip_lock);
4115    OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4116    OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4117    OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4118    spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4119    i_size_write(t_inode, size);
4120    t_inode->i_blocks = s_inode->i_blocks;
4121
4122    di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4123    di->i_clusters = s_di->i_clusters;
4124    di->i_size = s_di->i_size;
4125    di->i_dyn_features = s_di->i_dyn_features;
4126    di->i_attr = s_di->i_attr;
4127
4128    if (preserve) {
4129        t_inode->i_uid = s_inode->i_uid;
4130        t_inode->i_gid = s_inode->i_gid;
4131        t_inode->i_mode = s_inode->i_mode;
4132        di->i_uid = s_di->i_uid;
4133        di->i_gid = s_di->i_gid;
4134        di->i_mode = s_di->i_mode;
4135
4136        /*
4137         * update time.
4138         * we want mtime to appear identical to the source and
4139         * update ctime.
4140         */
4141        t_inode->i_ctime = CURRENT_TIME;
4142
4143        di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4144        di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4145
4146        t_inode->i_mtime = s_inode->i_mtime;
4147        di->i_mtime = s_di->i_mtime;
4148        di->i_mtime_nsec = s_di->i_mtime_nsec;
4149    }
4150
4151    ocfs2_journal_dirty(handle, t_bh);
4152
4153out_commit:
4154    ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4155    return ret;
4156}
4157
4158static int ocfs2_create_reflink_node(struct inode *s_inode,
4159                     struct buffer_head *s_bh,
4160                     struct inode *t_inode,
4161                     struct buffer_head *t_bh,
4162                     bool preserve)
4163{
4164    int ret;
4165    struct buffer_head *ref_root_bh = NULL;
4166    struct ocfs2_cached_dealloc_ctxt dealloc;
4167    struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4168    struct ocfs2_refcount_block *rb;
4169    struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4170    struct ocfs2_refcount_tree *ref_tree;
4171
4172    ocfs2_init_dealloc_ctxt(&dealloc);
4173
4174    ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4175                      le64_to_cpu(di->i_refcount_loc));
4176    if (ret) {
4177        mlog_errno(ret);
4178        goto out;
4179    }
4180
4181    if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4182        ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4183                          t_inode, t_bh);
4184        if (ret)
4185            mlog_errno(ret);
4186        goto out;
4187    }
4188
4189    ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4190                       1, &ref_tree, &ref_root_bh);
4191    if (ret) {
4192        mlog_errno(ret);
4193        goto out;
4194    }
4195    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4196
4197    ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4198                      &ref_tree->rf_ci, ref_root_bh,
4199                      &dealloc);
4200    if (ret) {
4201        mlog_errno(ret);
4202        goto out_unlock_refcount;
4203    }
4204
4205out_unlock_refcount:
4206    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4207    brelse(ref_root_bh);
4208out:
4209    if (ocfs2_dealloc_has_cluster(&dealloc)) {
4210        ocfs2_schedule_truncate_log_flush(osb, 1);
4211        ocfs2_run_deallocs(osb, &dealloc);
4212    }
4213
4214    return ret;
4215}
4216
4217static int __ocfs2_reflink(struct dentry *old_dentry,
4218               struct buffer_head *old_bh,
4219               struct inode *new_inode,
4220               bool preserve)
4221{
4222    int ret;
4223    struct inode *inode = old_dentry->d_inode;
4224    struct buffer_head *new_bh = NULL;
4225
4226    if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
4227        ret = -EINVAL;
4228        mlog_errno(ret);
4229        goto out;
4230    }
4231
4232    ret = filemap_fdatawrite(inode->i_mapping);
4233    if (ret) {
4234        mlog_errno(ret);
4235        goto out;
4236    }
4237
4238    ret = ocfs2_attach_refcount_tree(inode, old_bh);
4239    if (ret) {
4240        mlog_errno(ret);
4241        goto out;
4242    }
4243
4244    mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
4245    ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
4246                      OI_LS_REFLINK_TARGET);
4247    if (ret) {
4248        mlog_errno(ret);
4249        goto out_unlock;
4250    }
4251
4252    ret = ocfs2_create_reflink_node(inode, old_bh,
4253                    new_inode, new_bh, preserve);
4254    if (ret) {
4255        mlog_errno(ret);
4256        goto inode_unlock;
4257    }
4258
4259    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4260        ret = ocfs2_reflink_xattrs(inode, old_bh,
4261                       new_inode, new_bh,
4262                       preserve);
4263        if (ret) {
4264            mlog_errno(ret);
4265            goto inode_unlock;
4266        }
4267    }
4268
4269    ret = ocfs2_complete_reflink(inode, old_bh,
4270                     new_inode, new_bh, preserve);
4271    if (ret)
4272        mlog_errno(ret);
4273
4274inode_unlock:
4275    ocfs2_inode_unlock(new_inode, 1);
4276    brelse(new_bh);
4277out_unlock:
4278    mutex_unlock(&new_inode->i_mutex);
4279out:
4280    if (!ret) {
4281        ret = filemap_fdatawait(inode->i_mapping);
4282        if (ret)
4283            mlog_errno(ret);
4284    }
4285    return ret;
4286}
4287
4288static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4289             struct dentry *new_dentry, bool preserve)
4290{
4291    int error;
4292    struct inode *inode = old_dentry->d_inode;
4293    struct buffer_head *old_bh = NULL;
4294    struct inode *new_orphan_inode = NULL;
4295
4296    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4297        return -EOPNOTSUPP;
4298
4299    error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4300                         &new_orphan_inode);
4301    if (error) {
4302        mlog_errno(error);
4303        goto out;
4304    }
4305
4306    error = ocfs2_inode_lock(inode, &old_bh, 1);
4307    if (error) {
4308        mlog_errno(error);
4309        goto out;
4310    }
4311
4312    down_write(&OCFS2_I(inode)->ip_xattr_sem);
4313    down_write(&OCFS2_I(inode)->ip_alloc_sem);
4314    error = __ocfs2_reflink(old_dentry, old_bh,
4315                new_orphan_inode, preserve);
4316    up_write(&OCFS2_I(inode)->ip_alloc_sem);
4317    up_write(&OCFS2_I(inode)->ip_xattr_sem);
4318
4319    ocfs2_inode_unlock(inode, 1);
4320    brelse(old_bh);
4321
4322    if (error) {
4323        mlog_errno(error);
4324        goto out;
4325    }
4326
4327    /* If the security isn't preserved, we need to re-initialize them. */
4328    if (!preserve) {
4329        error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4330                            &new_dentry->d_name);
4331        if (error)
4332            mlog_errno(error);
4333    }
4334out:
4335    if (!error) {
4336        error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4337                               new_dentry);
4338        if (error)
4339            mlog_errno(error);
4340    }
4341
4342    if (new_orphan_inode) {
4343        /*
4344         * We need to open_unlock the inode no matter whether we
4345         * succeed or not, so that other nodes can delete it later.
4346         */
4347        ocfs2_open_unlock(new_orphan_inode);
4348        if (error)
4349            iput(new_orphan_inode);
4350    }
4351
4352    return error;
4353}
4354
4355/*
4356 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4357 * sys_reflink(). This will go away when vfs_reflink() exists in
4358 * fs/namei.c.
4359 */
4360
4361/* copied from may_create in VFS. */
4362static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4363{
4364    if (child->d_inode)
4365        return -EEXIST;
4366    if (IS_DEADDIR(dir))
4367        return -ENOENT;
4368    return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4369}
4370
4371/* copied from user_path_parent. */
4372static int ocfs2_user_path_parent(const char __user *path,
4373                  struct nameidata *nd, char **name)
4374{
4375    char *s = getname(path);
4376    int error;
4377
4378    if (IS_ERR(s))
4379        return PTR_ERR(s);
4380
4381    error = kern_path_parent(s, nd);
4382    if (error)
4383        putname(s);
4384    else
4385        *name = s;
4386
4387    return error;
4388}
4389
4390/**
4391 * ocfs2_vfs_reflink - Create a reference-counted link
4392 *
4393 * @old_dentry: source dentry + inode
4394 * @dir: directory to create the target
4395 * @new_dentry: target dentry
4396 * @preserve: if true, preserve all file attributes
4397 */
4398static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4399                 struct dentry *new_dentry, bool preserve)
4400{
4401    struct inode *inode = old_dentry->d_inode;
4402    int error;
4403
4404    if (!inode)
4405        return -ENOENT;
4406
4407    error = ocfs2_may_create(dir, new_dentry);
4408    if (error)
4409        return error;
4410
4411    if (dir->i_sb != inode->i_sb)
4412        return -EXDEV;
4413
4414    /*
4415     * A reflink to an append-only or immutable file cannot be created.
4416     */
4417    if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4418        return -EPERM;
4419
4420    /* Only regular files can be reflinked. */
4421    if (!S_ISREG(inode->i_mode))
4422        return -EPERM;
4423
4424    /*
4425     * If the caller wants to preserve ownership, they require the
4426     * rights to do so.
4427     */
4428    if (preserve) {
4429        if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4430            return -EPERM;
4431        if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4432            return -EPERM;
4433    }
4434
4435    /*
4436     * If the caller is modifying any aspect of the attributes, they
4437     * are not creating a snapshot. They need read permission on the
4438     * file.
4439     */
4440    if (!preserve) {
4441        error = inode_permission(inode, MAY_READ);
4442        if (error)
4443            return error;
4444    }
4445
4446    mutex_lock(&inode->i_mutex);
4447    dquot_initialize(dir);
4448    error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4449    mutex_unlock(&inode->i_mutex);
4450    if (!error)
4451        fsnotify_create(dir, new_dentry);
4452    return error;
4453}
4454/*
4455 * Most codes are copied from sys_linkat.
4456 */
4457int ocfs2_reflink_ioctl(struct inode *inode,
4458            const char __user *oldname,
4459            const char __user *newname,
4460            bool preserve)
4461{
4462    struct dentry *new_dentry;
4463    struct nameidata nd;
4464    struct path old_path;
4465    int error;
4466    char *to = NULL;
4467
4468    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4469        return -EOPNOTSUPP;
4470
4471    error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4472    if (error) {
4473        mlog_errno(error);
4474        return error;
4475    }
4476
4477    error = ocfs2_user_path_parent(newname, &nd, &to);
4478    if (error) {
4479        mlog_errno(error);
4480        goto out;
4481    }
4482
4483    error = -EXDEV;
4484    if (old_path.mnt != nd.path.mnt)
4485        goto out_release;
4486    new_dentry = lookup_create(&nd, 0);
4487    error = PTR_ERR(new_dentry);
4488    if (IS_ERR(new_dentry)) {
4489        mlog_errno(error);
4490        goto out_unlock;
4491    }
4492
4493    error = mnt_want_write(nd.path.mnt);
4494    if (error) {
4495        mlog_errno(error);
4496        goto out_dput;
4497    }
4498
4499    error = ocfs2_vfs_reflink(old_path.dentry,
4500                  nd.path.dentry->d_inode,
4501                  new_dentry, preserve);
4502    mnt_drop_write(nd.path.mnt);
4503out_dput:
4504    dput(new_dentry);
4505out_unlock:
4506    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4507out_release:
4508    path_put(&nd.path);
4509    putname(to);
4510out:
4511    path_put(&old_path);
4512
4513    return error;
4514}
4515

Archive Download this file



interactive