Root/fs/ocfs2/refcounttree.c

1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.c
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h>
21#include "ocfs2.h"
22#include "inode.h"
23#include "alloc.h"
24#include "suballoc.h"
25#include "journal.h"
26#include "uptodate.h"
27#include "super.h"
28#include "buffer_head_io.h"
29#include "blockcheck.h"
30#include "refcounttree.h"
31#include "sysfile.h"
32#include "dlmglue.h"
33#include "extent_map.h"
34#include "aops.h"
35#include "xattr.h"
36#include "namei.h"
37
38#include <linux/bio.h>
39#include <linux/blkdev.h>
40#include <linux/slab.h>
41#include <linux/writeback.h>
42#include <linux/pagevec.h>
43#include <linux/swap.h>
44#include <linux/security.h>
45#include <linux/fsnotify.h>
46#include <linux/quotaops.h>
47#include <linux/namei.h>
48#include <linux/mount.h>
49
50struct ocfs2_cow_context {
51    struct inode *inode;
52    u32 cow_start;
53    u32 cow_len;
54    struct ocfs2_extent_tree data_et;
55    struct ocfs2_refcount_tree *ref_tree;
56    struct buffer_head *ref_root_bh;
57    struct ocfs2_alloc_context *meta_ac;
58    struct ocfs2_alloc_context *data_ac;
59    struct ocfs2_cached_dealloc_ctxt dealloc;
60    void *cow_object;
61    struct ocfs2_post_refcount *post_refcount;
62    int extra_credits;
63    int (*get_clusters)(struct ocfs2_cow_context *context,
64                u32 v_cluster, u32 *p_cluster,
65                u32 *num_clusters,
66                unsigned int *extent_flags);
67    int (*cow_duplicate_clusters)(handle_t *handle,
68                      struct ocfs2_cow_context *context,
69                      u32 cpos, u32 old_cluster,
70                      u32 new_cluster, u32 new_len);
71};
72
73static inline struct ocfs2_refcount_tree *
74cache_info_to_refcount(struct ocfs2_caching_info *ci)
75{
76    return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
77}
78
79static int ocfs2_validate_refcount_block(struct super_block *sb,
80                     struct buffer_head *bh)
81{
82    int rc;
83    struct ocfs2_refcount_block *rb =
84        (struct ocfs2_refcount_block *)bh->b_data;
85
86    mlog(0, "Validating refcount block %llu\n",
87         (unsigned long long)bh->b_blocknr);
88
89    BUG_ON(!buffer_uptodate(bh));
90
91    /*
92     * If the ecc fails, we return the error but otherwise
93     * leave the filesystem running. We know any error is
94     * local to this block.
95     */
96    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
97    if (rc) {
98        mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
99             (unsigned long long)bh->b_blocknr);
100        return rc;
101    }
102
103
104    if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
105        ocfs2_error(sb,
106                "Refcount block #%llu has bad signature %.*s",
107                (unsigned long long)bh->b_blocknr, 7,
108                rb->rf_signature);
109        return -EINVAL;
110    }
111
112    if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
113        ocfs2_error(sb,
114                "Refcount block #%llu has an invalid rf_blkno "
115                "of %llu",
116                (unsigned long long)bh->b_blocknr,
117                (unsigned long long)le64_to_cpu(rb->rf_blkno));
118        return -EINVAL;
119    }
120
121    if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
122        ocfs2_error(sb,
123                "Refcount block #%llu has an invalid "
124                "rf_fs_generation of #%u",
125                (unsigned long long)bh->b_blocknr,
126                le32_to_cpu(rb->rf_fs_generation));
127        return -EINVAL;
128    }
129
130    return 0;
131}
132
133static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
134                     u64 rb_blkno,
135                     struct buffer_head **bh)
136{
137    int rc;
138    struct buffer_head *tmp = *bh;
139
140    rc = ocfs2_read_block(ci, rb_blkno, &tmp,
141                  ocfs2_validate_refcount_block);
142
143    /* If ocfs2_read_block() got us a new bh, pass it up. */
144    if (!rc && !*bh)
145        *bh = tmp;
146
147    return rc;
148}
149
150static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
151{
152    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
153
154    return rf->rf_blkno;
155}
156
157static struct super_block *
158ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
159{
160    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
161
162    return rf->rf_sb;
163}
164
165static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
166{
167    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
168
169    spin_lock(&rf->rf_lock);
170}
171
172static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
173{
174    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
175
176    spin_unlock(&rf->rf_lock);
177}
178
179static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
180{
181    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
182
183    mutex_lock(&rf->rf_io_mutex);
184}
185
186static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
187{
188    struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
189
190    mutex_unlock(&rf->rf_io_mutex);
191}
192
193static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
194    .co_owner = ocfs2_refcount_cache_owner,
195    .co_get_super = ocfs2_refcount_cache_get_super,
196    .co_cache_lock = ocfs2_refcount_cache_lock,
197    .co_cache_unlock = ocfs2_refcount_cache_unlock,
198    .co_io_lock = ocfs2_refcount_cache_io_lock,
199    .co_io_unlock = ocfs2_refcount_cache_io_unlock,
200};
201
202static struct ocfs2_refcount_tree *
203ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
204{
205    struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
206    struct ocfs2_refcount_tree *tree = NULL;
207
208    while (n) {
209        tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
210
211        if (blkno < tree->rf_blkno)
212            n = n->rb_left;
213        else if (blkno > tree->rf_blkno)
214            n = n->rb_right;
215        else
216            return tree;
217    }
218
219    return NULL;
220}
221
222/* osb_lock is already locked. */
223static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
224                       struct ocfs2_refcount_tree *new)
225{
226    u64 rf_blkno = new->rf_blkno;
227    struct rb_node *parent = NULL;
228    struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
229    struct ocfs2_refcount_tree *tmp;
230
231    while (*p) {
232        parent = *p;
233
234        tmp = rb_entry(parent, struct ocfs2_refcount_tree,
235                   rf_node);
236
237        if (rf_blkno < tmp->rf_blkno)
238            p = &(*p)->rb_left;
239        else if (rf_blkno > tmp->rf_blkno)
240            p = &(*p)->rb_right;
241        else {
242            /* This should never happen! */
243            mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
244                 (unsigned long long)rf_blkno);
245            BUG();
246        }
247    }
248
249    rb_link_node(&new->rf_node, parent, p);
250    rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
251}
252
253static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
254{
255    ocfs2_metadata_cache_exit(&tree->rf_ci);
256    ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
257    ocfs2_lock_res_free(&tree->rf_lockres);
258    kfree(tree);
259}
260
261static inline void
262ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
263                    struct ocfs2_refcount_tree *tree)
264{
265    rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
266    if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
267        osb->osb_ref_tree_lru = NULL;
268}
269
270static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
271                    struct ocfs2_refcount_tree *tree)
272{
273    spin_lock(&osb->osb_lock);
274    ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
275    spin_unlock(&osb->osb_lock);
276}
277
278static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
279{
280    struct ocfs2_refcount_tree *tree =
281        container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
282
283    ocfs2_free_refcount_tree(tree);
284}
285
286static inline void
287ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
288{
289    kref_get(&tree->rf_getcnt);
290}
291
292static inline void
293ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
294{
295    kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
296}
297
298static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
299                           struct super_block *sb)
300{
301    ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
302    mutex_init(&new->rf_io_mutex);
303    new->rf_sb = sb;
304    spin_lock_init(&new->rf_lock);
305}
306
307static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
308                    struct ocfs2_refcount_tree *new,
309                    u64 rf_blkno, u32 generation)
310{
311    init_rwsem(&new->rf_sem);
312    ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
313                     rf_blkno, generation);
314}
315
316static struct ocfs2_refcount_tree*
317ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
318{
319    struct ocfs2_refcount_tree *new;
320
321    new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
322    if (!new)
323        return NULL;
324
325    new->rf_blkno = rf_blkno;
326    kref_init(&new->rf_getcnt);
327    ocfs2_init_refcount_tree_ci(new, osb->sb);
328
329    return new;
330}
331
332static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
333                   struct ocfs2_refcount_tree **ret_tree)
334{
335    int ret = 0;
336    struct ocfs2_refcount_tree *tree, *new = NULL;
337    struct buffer_head *ref_root_bh = NULL;
338    struct ocfs2_refcount_block *ref_rb;
339
340    spin_lock(&osb->osb_lock);
341    if (osb->osb_ref_tree_lru &&
342        osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
343        tree = osb->osb_ref_tree_lru;
344    else
345        tree = ocfs2_find_refcount_tree(osb, rf_blkno);
346    if (tree)
347        goto out;
348
349    spin_unlock(&osb->osb_lock);
350
351    new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
352    if (!new) {
353        ret = -ENOMEM;
354        mlog_errno(ret);
355        return ret;
356    }
357    /*
358     * We need the generation to create the refcount tree lock and since
359     * it isn't changed during the tree modification, we are safe here to
360     * read without protection.
361     * We also have to purge the cache after we create the lock since the
362     * refcount block may have the stale data. It can only be trusted when
363     * we hold the refcount lock.
364     */
365    ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
366    if (ret) {
367        mlog_errno(ret);
368        ocfs2_metadata_cache_exit(&new->rf_ci);
369        kfree(new);
370        return ret;
371    }
372
373    ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
374    new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
375    ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
376                      new->rf_generation);
377    ocfs2_metadata_cache_purge(&new->rf_ci);
378
379    spin_lock(&osb->osb_lock);
380    tree = ocfs2_find_refcount_tree(osb, rf_blkno);
381    if (tree)
382        goto out;
383
384    ocfs2_insert_refcount_tree(osb, new);
385
386    tree = new;
387    new = NULL;
388
389out:
390    *ret_tree = tree;
391
392    osb->osb_ref_tree_lru = tree;
393
394    spin_unlock(&osb->osb_lock);
395
396    if (new)
397        ocfs2_free_refcount_tree(new);
398
399    brelse(ref_root_bh);
400    return ret;
401}
402
403static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
404{
405    int ret;
406    struct buffer_head *di_bh = NULL;
407    struct ocfs2_dinode *di;
408
409    ret = ocfs2_read_inode_block(inode, &di_bh);
410    if (ret) {
411        mlog_errno(ret);
412        goto out;
413    }
414
415    BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
416
417    di = (struct ocfs2_dinode *)di_bh->b_data;
418    *ref_blkno = le64_to_cpu(di->i_refcount_loc);
419    brelse(di_bh);
420out:
421    return ret;
422}
423
424static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
425                      struct ocfs2_refcount_tree *tree, int rw)
426{
427    int ret;
428
429    ret = ocfs2_refcount_lock(tree, rw);
430    if (ret) {
431        mlog_errno(ret);
432        goto out;
433    }
434
435    if (rw)
436        down_write(&tree->rf_sem);
437    else
438        down_read(&tree->rf_sem);
439
440out:
441    return ret;
442}
443
444/*
445 * Lock the refcount tree pointed by ref_blkno and return the tree.
446 * In most case, we lock the tree and read the refcount block.
447 * So read it here if the caller really needs it.
448 *
449 * If the tree has been re-created by other node, it will free the
450 * old one and re-create it.
451 */
452int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
453                 u64 ref_blkno, int rw,
454                 struct ocfs2_refcount_tree **ret_tree,
455                 struct buffer_head **ref_bh)
456{
457    int ret, delete_tree = 0;
458    struct ocfs2_refcount_tree *tree = NULL;
459    struct buffer_head *ref_root_bh = NULL;
460    struct ocfs2_refcount_block *rb;
461
462again:
463    ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
464    if (ret) {
465        mlog_errno(ret);
466        return ret;
467    }
468
469    ocfs2_refcount_tree_get(tree);
470
471    ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
472    if (ret) {
473        mlog_errno(ret);
474        ocfs2_refcount_tree_put(tree);
475        goto out;
476    }
477
478    ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
479                    &ref_root_bh);
480    if (ret) {
481        mlog_errno(ret);
482        ocfs2_unlock_refcount_tree(osb, tree, rw);
483        ocfs2_refcount_tree_put(tree);
484        goto out;
485    }
486
487    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
488    /*
489     * If the refcount block has been freed and re-created, we may need
490     * to recreate the refcount tree also.
491     *
492     * Here we just remove the tree from the rb-tree, and the last
493     * kref holder will unlock and delete this refcount_tree.
494     * Then we goto "again" and ocfs2_get_refcount_tree will create
495     * the new refcount tree for us.
496     */
497    if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
498        if (!tree->rf_removed) {
499            ocfs2_erase_refcount_tree_from_list(osb, tree);
500            tree->rf_removed = 1;
501            delete_tree = 1;
502        }
503
504        ocfs2_unlock_refcount_tree(osb, tree, rw);
505        /*
506         * We get an extra reference when we create the refcount
507         * tree, so another put will destroy it.
508         */
509        if (delete_tree)
510            ocfs2_refcount_tree_put(tree);
511        brelse(ref_root_bh);
512        ref_root_bh = NULL;
513        goto again;
514    }
515
516    *ret_tree = tree;
517    if (ref_bh) {
518        *ref_bh = ref_root_bh;
519        ref_root_bh = NULL;
520    }
521out:
522    brelse(ref_root_bh);
523    return ret;
524}
525
526void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
527                struct ocfs2_refcount_tree *tree, int rw)
528{
529    if (rw)
530        up_write(&tree->rf_sem);
531    else
532        up_read(&tree->rf_sem);
533
534    ocfs2_refcount_unlock(tree, rw);
535    ocfs2_refcount_tree_put(tree);
536}
537
538void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
539{
540    struct rb_node *node;
541    struct ocfs2_refcount_tree *tree;
542    struct rb_root *root = &osb->osb_rf_lock_tree;
543
544    while ((node = rb_last(root)) != NULL) {
545        tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
546
547        mlog(0, "Purge tree %llu\n",
548             (unsigned long long) tree->rf_blkno);
549
550        rb_erase(&tree->rf_node, root);
551        ocfs2_free_refcount_tree(tree);
552    }
553}
554
555/*
556 * Create a refcount tree for an inode.
557 * We take for granted that the inode is already locked.
558 */
559static int ocfs2_create_refcount_tree(struct inode *inode,
560                      struct buffer_head *di_bh)
561{
562    int ret;
563    handle_t *handle = NULL;
564    struct ocfs2_alloc_context *meta_ac = NULL;
565    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
566    struct ocfs2_inode_info *oi = OCFS2_I(inode);
567    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
568    struct buffer_head *new_bh = NULL;
569    struct ocfs2_refcount_block *rb;
570    struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
571    u16 suballoc_bit_start;
572    u32 num_got;
573    u64 first_blkno;
574
575    BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
576
577    mlog(0, "create tree for inode %lu\n", inode->i_ino);
578
579    ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
580    if (ret) {
581        mlog_errno(ret);
582        goto out;
583    }
584
585    handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
586    if (IS_ERR(handle)) {
587        ret = PTR_ERR(handle);
588        mlog_errno(ret);
589        goto out;
590    }
591
592    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
593                      OCFS2_JOURNAL_ACCESS_WRITE);
594    if (ret) {
595        mlog_errno(ret);
596        goto out_commit;
597    }
598
599    ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
600                   &suballoc_bit_start, &num_got,
601                   &first_blkno);
602    if (ret) {
603        mlog_errno(ret);
604        goto out_commit;
605    }
606
607    new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
608    if (!new_tree) {
609        ret = -ENOMEM;
610        mlog_errno(ret);
611        goto out_commit;
612    }
613
614    new_bh = sb_getblk(inode->i_sb, first_blkno);
615    ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
616
617    ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
618                      OCFS2_JOURNAL_ACCESS_CREATE);
619    if (ret) {
620        mlog_errno(ret);
621        goto out_commit;
622    }
623
624    /* Initialize ocfs2_refcount_block. */
625    rb = (struct ocfs2_refcount_block *)new_bh->b_data;
626    memset(rb, 0, inode->i_sb->s_blocksize);
627    strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
628    rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
629    rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
630    rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
631    rb->rf_blkno = cpu_to_le64(first_blkno);
632    rb->rf_count = cpu_to_le32(1);
633    rb->rf_records.rl_count =
634            cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
635    spin_lock(&osb->osb_lock);
636    rb->rf_generation = osb->s_next_generation++;
637    spin_unlock(&osb->osb_lock);
638
639    ocfs2_journal_dirty(handle, new_bh);
640
641    spin_lock(&oi->ip_lock);
642    oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
643    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
644    di->i_refcount_loc = cpu_to_le64(first_blkno);
645    spin_unlock(&oi->ip_lock);
646
647    mlog(0, "created tree for inode %lu, refblock %llu\n",
648         inode->i_ino, (unsigned long long)first_blkno);
649
650    ocfs2_journal_dirty(handle, di_bh);
651
652    /*
653     * We have to init the tree lock here since it will use
654     * the generation number to create it.
655     */
656    new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
657    ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
658                      new_tree->rf_generation);
659
660    spin_lock(&osb->osb_lock);
661    tree = ocfs2_find_refcount_tree(osb, first_blkno);
662
663    /*
664     * We've just created a new refcount tree in this block. If
665     * we found a refcount tree on the ocfs2_super, it must be
666     * one we just deleted. We free the old tree before
667     * inserting the new tree.
668     */
669    BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
670    if (tree)
671        ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
672    ocfs2_insert_refcount_tree(osb, new_tree);
673    spin_unlock(&osb->osb_lock);
674    new_tree = NULL;
675    if (tree)
676        ocfs2_refcount_tree_put(tree);
677
678out_commit:
679    ocfs2_commit_trans(osb, handle);
680
681out:
682    if (new_tree) {
683        ocfs2_metadata_cache_exit(&new_tree->rf_ci);
684        kfree(new_tree);
685    }
686
687    brelse(new_bh);
688    if (meta_ac)
689        ocfs2_free_alloc_context(meta_ac);
690
691    return ret;
692}
693
694static int ocfs2_set_refcount_tree(struct inode *inode,
695                   struct buffer_head *di_bh,
696                   u64 refcount_loc)
697{
698    int ret;
699    handle_t *handle = NULL;
700    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
701    struct ocfs2_inode_info *oi = OCFS2_I(inode);
702    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
703    struct buffer_head *ref_root_bh = NULL;
704    struct ocfs2_refcount_block *rb;
705    struct ocfs2_refcount_tree *ref_tree;
706
707    BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
708
709    ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
710                       &ref_tree, &ref_root_bh);
711    if (ret) {
712        mlog_errno(ret);
713        return ret;
714    }
715
716    handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
717    if (IS_ERR(handle)) {
718        ret = PTR_ERR(handle);
719        mlog_errno(ret);
720        goto out;
721    }
722
723    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
724                      OCFS2_JOURNAL_ACCESS_WRITE);
725    if (ret) {
726        mlog_errno(ret);
727        goto out_commit;
728    }
729
730    ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
731                      OCFS2_JOURNAL_ACCESS_WRITE);
732    if (ret) {
733        mlog_errno(ret);
734        goto out_commit;
735    }
736
737    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
738    le32_add_cpu(&rb->rf_count, 1);
739
740    ocfs2_journal_dirty(handle, ref_root_bh);
741
742    spin_lock(&oi->ip_lock);
743    oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
744    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
745    di->i_refcount_loc = cpu_to_le64(refcount_loc);
746    spin_unlock(&oi->ip_lock);
747    ocfs2_journal_dirty(handle, di_bh);
748
749out_commit:
750    ocfs2_commit_trans(osb, handle);
751out:
752    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
753    brelse(ref_root_bh);
754
755    return ret;
756}
757
758int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
759{
760    int ret, delete_tree = 0;
761    handle_t *handle = NULL;
762    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
763    struct ocfs2_inode_info *oi = OCFS2_I(inode);
764    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
765    struct ocfs2_refcount_block *rb;
766    struct inode *alloc_inode = NULL;
767    struct buffer_head *alloc_bh = NULL;
768    struct buffer_head *blk_bh = NULL;
769    struct ocfs2_refcount_tree *ref_tree;
770    int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
771    u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
772    u16 bit = 0;
773
774    if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
775        return 0;
776
777    BUG_ON(!ref_blkno);
778    ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
779    if (ret) {
780        mlog_errno(ret);
781        return ret;
782    }
783
784    rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
785
786    /*
787     * If we are the last user, we need to free the block.
788     * So lock the allocator ahead.
789     */
790    if (le32_to_cpu(rb->rf_count) == 1) {
791        blk = le64_to_cpu(rb->rf_blkno);
792        bit = le16_to_cpu(rb->rf_suballoc_bit);
793        bg_blkno = ocfs2_which_suballoc_group(blk, bit);
794
795        alloc_inode = ocfs2_get_system_file_inode(osb,
796                    EXTENT_ALLOC_SYSTEM_INODE,
797                    le16_to_cpu(rb->rf_suballoc_slot));
798        if (!alloc_inode) {
799            ret = -ENOMEM;
800            mlog_errno(ret);
801            goto out;
802        }
803        mutex_lock(&alloc_inode->i_mutex);
804
805        ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
806        if (ret) {
807            mlog_errno(ret);
808            goto out_mutex;
809        }
810
811        credits += OCFS2_SUBALLOC_FREE;
812    }
813
814    handle = ocfs2_start_trans(osb, credits);
815    if (IS_ERR(handle)) {
816        ret = PTR_ERR(handle);
817        mlog_errno(ret);
818        goto out_unlock;
819    }
820
821    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
822                      OCFS2_JOURNAL_ACCESS_WRITE);
823    if (ret) {
824        mlog_errno(ret);
825        goto out_commit;
826    }
827
828    ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
829                      OCFS2_JOURNAL_ACCESS_WRITE);
830    if (ret) {
831        mlog_errno(ret);
832        goto out_commit;
833    }
834
835    spin_lock(&oi->ip_lock);
836    oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
837    di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
838    di->i_refcount_loc = 0;
839    spin_unlock(&oi->ip_lock);
840    ocfs2_journal_dirty(handle, di_bh);
841
842    le32_add_cpu(&rb->rf_count , -1);
843    ocfs2_journal_dirty(handle, blk_bh);
844
845    if (!rb->rf_count) {
846        delete_tree = 1;
847        ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
848        ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
849                           alloc_bh, bit, bg_blkno, 1);
850        if (ret)
851            mlog_errno(ret);
852    }
853
854out_commit:
855    ocfs2_commit_trans(osb, handle);
856out_unlock:
857    if (alloc_inode) {
858        ocfs2_inode_unlock(alloc_inode, 1);
859        brelse(alloc_bh);
860    }
861out_mutex:
862    if (alloc_inode) {
863        mutex_unlock(&alloc_inode->i_mutex);
864        iput(alloc_inode);
865    }
866out:
867    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
868    if (delete_tree)
869        ocfs2_refcount_tree_put(ref_tree);
870    brelse(blk_bh);
871
872    return ret;
873}
874
875static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
876                      struct buffer_head *ref_leaf_bh,
877                      u64 cpos, unsigned int len,
878                      struct ocfs2_refcount_rec *ret_rec,
879                      int *index)
880{
881    int i = 0;
882    struct ocfs2_refcount_block *rb =
883        (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
884    struct ocfs2_refcount_rec *rec = NULL;
885
886    for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
887        rec = &rb->rf_records.rl_recs[i];
888
889        if (le64_to_cpu(rec->r_cpos) +
890            le32_to_cpu(rec->r_clusters) <= cpos)
891            continue;
892        else if (le64_to_cpu(rec->r_cpos) > cpos)
893            break;
894
895        /* ok, cpos fail in this rec. Just return. */
896        if (ret_rec)
897            *ret_rec = *rec;
898        goto out;
899    }
900
901    if (ret_rec) {
902        /* We meet with a hole here, so fake the rec. */
903        ret_rec->r_cpos = cpu_to_le64(cpos);
904        ret_rec->r_refcount = 0;
905        if (i < le16_to_cpu(rb->rf_records.rl_used) &&
906            le64_to_cpu(rec->r_cpos) < cpos + len)
907            ret_rec->r_clusters =
908                cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
909        else
910            ret_rec->r_clusters = cpu_to_le32(len);
911    }
912
913out:
914    *index = i;
915}
916
917/*
918 * Try to remove refcount tree. The mechanism is:
919 * 1) Check whether i_clusters == 0, if no, exit.
920 * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
921 * 3) Check whether we have inline xattr stored outside, if yes, exit.
922 * 4) Remove the tree.
923 */
924int ocfs2_try_remove_refcount_tree(struct inode *inode,
925                   struct buffer_head *di_bh)
926{
927    int ret;
928    struct ocfs2_inode_info *oi = OCFS2_I(inode);
929    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
930
931    down_write(&oi->ip_xattr_sem);
932    down_write(&oi->ip_alloc_sem);
933
934    if (oi->ip_clusters)
935        goto out;
936
937    if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
938        goto out;
939
940    if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
941        ocfs2_has_inline_xattr_value_outside(inode, di))
942        goto out;
943
944    ret = ocfs2_remove_refcount_tree(inode, di_bh);
945    if (ret)
946        mlog_errno(ret);
947out:
948    up_write(&oi->ip_alloc_sem);
949    up_write(&oi->ip_xattr_sem);
950    return 0;
951}
952
953/*
954 * Find the end range for a leaf refcount block indicated by
955 * el->l_recs[index].e_blkno.
956 */
957static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
958                       struct buffer_head *ref_root_bh,
959                       struct ocfs2_extent_block *eb,
960                       struct ocfs2_extent_list *el,
961                       int index, u32 *cpos_end)
962{
963    int ret, i, subtree_root;
964    u32 cpos;
965    u64 blkno;
966    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
967    struct ocfs2_path *left_path = NULL, *right_path = NULL;
968    struct ocfs2_extent_tree et;
969    struct ocfs2_extent_list *tmp_el;
970
971    if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
972        /*
973         * We have a extent rec after index, so just use the e_cpos
974         * of the next extent rec.
975         */
976        *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
977        return 0;
978    }
979
980    if (!eb || (eb && !eb->h_next_leaf_blk)) {
981        /*
982         * We are the last extent rec, so any high cpos should
983         * be stored in this leaf refcount block.
984         */
985        *cpos_end = UINT_MAX;
986        return 0;
987    }
988
989    /*
990     * If the extent block isn't the last one, we have to find
991     * the subtree root between this extent block and the next
992     * leaf extent block and get the corresponding e_cpos from
993     * the subroot. Otherwise we may corrupt the b-tree.
994     */
995    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
996
997    left_path = ocfs2_new_path_from_et(&et);
998    if (!left_path) {
999        ret = -ENOMEM;
1000        mlog_errno(ret);
1001        goto out;
1002    }
1003
1004    cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
1005    ret = ocfs2_find_path(ci, left_path, cpos);
1006    if (ret) {
1007        mlog_errno(ret);
1008        goto out;
1009    }
1010
1011    right_path = ocfs2_new_path_from_path(left_path);
1012    if (!right_path) {
1013        ret = -ENOMEM;
1014        mlog_errno(ret);
1015        goto out;
1016    }
1017
1018    ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
1019    if (ret) {
1020        mlog_errno(ret);
1021        goto out;
1022    }
1023
1024    ret = ocfs2_find_path(ci, right_path, cpos);
1025    if (ret) {
1026        mlog_errno(ret);
1027        goto out;
1028    }
1029
1030    subtree_root = ocfs2_find_subtree_root(&et, left_path,
1031                           right_path);
1032
1033    tmp_el = left_path->p_node[subtree_root].el;
1034    blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
1035    for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
1036        if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
1037            *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
1038            break;
1039        }
1040    }
1041
1042    BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
1043
1044out:
1045    ocfs2_free_path(left_path);
1046    ocfs2_free_path(right_path);
1047    return ret;
1048}
1049
1050/*
1051 * Given a cpos and len, try to find the refcount record which contains cpos.
1052 * 1. If cpos can be found in one refcount record, return the record.
1053 * 2. If cpos can't be found, return a fake record which start from cpos
1054 * and end at a small value between cpos+len and start of the next record.
1055 * This fake record has r_refcount = 0.
1056 */
1057static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
1058                  struct buffer_head *ref_root_bh,
1059                  u64 cpos, unsigned int len,
1060                  struct ocfs2_refcount_rec *ret_rec,
1061                  int *index,
1062                  struct buffer_head **ret_bh)
1063{
1064    int ret = 0, i, found;
1065    u32 low_cpos, uninitialized_var(cpos_end);
1066    struct ocfs2_extent_list *el;
1067    struct ocfs2_extent_rec *rec = NULL;
1068    struct ocfs2_extent_block *eb = NULL;
1069    struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
1070    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1071    struct ocfs2_refcount_block *rb =
1072            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1073
1074    if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
1075        ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
1076                          ret_rec, index);
1077        *ret_bh = ref_root_bh;
1078        get_bh(ref_root_bh);
1079        return 0;
1080    }
1081
1082    el = &rb->rf_list;
1083    low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1084
1085    if (el->l_tree_depth) {
1086        ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1087        if (ret) {
1088            mlog_errno(ret);
1089            goto out;
1090        }
1091
1092        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1093        el = &eb->h_list;
1094
1095        if (el->l_tree_depth) {
1096            ocfs2_error(sb,
1097            "refcount tree %llu has non zero tree "
1098            "depth in leaf btree tree block %llu\n",
1099            (unsigned long long)ocfs2_metadata_cache_owner(ci),
1100            (unsigned long long)eb_bh->b_blocknr);
1101            ret = -EROFS;
1102            goto out;
1103        }
1104    }
1105
1106    found = 0;
1107    for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1108        rec = &el->l_recs[i];
1109
1110        if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1111            found = 1;
1112            break;
1113        }
1114    }
1115
1116    if (found) {
1117        ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
1118                          eb, el, i, &cpos_end);
1119        if (ret) {
1120            mlog_errno(ret);
1121            goto out;
1122        }
1123
1124        if (cpos_end < low_cpos + len)
1125            len = cpos_end - low_cpos;
1126    }
1127
1128    ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1129                    &ref_leaf_bh);
1130    if (ret) {
1131        mlog_errno(ret);
1132        goto out;
1133    }
1134
1135    ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1136                      ret_rec, index);
1137    *ret_bh = ref_leaf_bh;
1138out:
1139    brelse(eb_bh);
1140    return ret;
1141}
1142
/* How a refcount record relates to its neighbours in the record list. */
enum ocfs2_ref_rec_contig {
	/* not mergeable with either neighbour */
	REF_CONTIG_NONE		= 0,
	/* mergeable with the previous record */
	REF_CONTIG_LEFT		= 1,
	/* mergeable with the next record */
	REF_CONTIG_RIGHT	= 2,
	/* mergeable with both neighbours */
	REF_CONTIG_LEFTRIGHT	= 3,
};
1149
1150static enum ocfs2_ref_rec_contig
1151    ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1152                    int index)
1153{
1154    if ((rb->rf_records.rl_recs[index].r_refcount ==
1155        rb->rf_records.rl_recs[index + 1].r_refcount) &&
1156        (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1157        le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1158        le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1159        return REF_CONTIG_RIGHT;
1160
1161    return REF_CONTIG_NONE;
1162}
1163
1164static enum ocfs2_ref_rec_contig
1165    ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1166                  int index)
1167{
1168    enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1169
1170    if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1171        ret = ocfs2_refcount_rec_adjacent(rb, index);
1172
1173    if (index > 0) {
1174        enum ocfs2_ref_rec_contig tmp;
1175
1176        tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1177
1178        if (tmp == REF_CONTIG_RIGHT) {
1179            if (ret == REF_CONTIG_RIGHT)
1180                ret = REF_CONTIG_LEFTRIGHT;
1181            else
1182                ret = REF_CONTIG_LEFT;
1183        }
1184    }
1185
1186    return ret;
1187}
1188
1189static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1190                       int index)
1191{
1192    BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1193           rb->rf_records.rl_recs[index+1].r_refcount);
1194
1195    le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1196             le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1197
1198    if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1199        memmove(&rb->rf_records.rl_recs[index + 1],
1200            &rb->rf_records.rl_recs[index + 2],
1201            sizeof(struct ocfs2_refcount_rec) *
1202            (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1203
1204    memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1205           0, sizeof(struct ocfs2_refcount_rec));
1206    le16_add_cpu(&rb->rf_records.rl_used, -1);
1207}
1208
1209/*
1210 * Merge the refcount rec if we are contiguous with the adjacent recs.
1211 */
1212static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1213                     int index)
1214{
1215    enum ocfs2_ref_rec_contig contig =
1216                ocfs2_refcount_rec_contig(rb, index);
1217
1218    if (contig == REF_CONTIG_NONE)
1219        return;
1220
1221    if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1222        BUG_ON(index == 0);
1223        index--;
1224    }
1225
1226    ocfs2_rotate_refcount_rec_left(rb, index);
1227
1228    if (contig == REF_CONTIG_LEFTRIGHT)
1229        ocfs2_rotate_refcount_rec_left(rb, index);
1230}
1231
1232/*
1233 * Change the refcount indexed by "index" in ref_bh.
1234 * If refcount reaches 0, remove it.
1235 */
1236static int ocfs2_change_refcount_rec(handle_t *handle,
1237                     struct ocfs2_caching_info *ci,
1238                     struct buffer_head *ref_leaf_bh,
1239                     int index, int merge, int change)
1240{
1241    int ret;
1242    struct ocfs2_refcount_block *rb =
1243            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1244    struct ocfs2_refcount_list *rl = &rb->rf_records;
1245    struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1246
1247    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1248                      OCFS2_JOURNAL_ACCESS_WRITE);
1249    if (ret) {
1250        mlog_errno(ret);
1251        goto out;
1252    }
1253
1254    mlog(0, "change index %d, old count %u, change %d\n", index,
1255         le32_to_cpu(rec->r_refcount), change);
1256    le32_add_cpu(&rec->r_refcount, change);
1257
1258    if (!rec->r_refcount) {
1259        if (index != le16_to_cpu(rl->rl_used) - 1) {
1260            memmove(rec, rec + 1,
1261                (le16_to_cpu(rl->rl_used) - index - 1) *
1262                sizeof(struct ocfs2_refcount_rec));
1263            memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1264                   0, sizeof(struct ocfs2_refcount_rec));
1265        }
1266
1267        le16_add_cpu(&rl->rl_used, -1);
1268    } else if (merge)
1269        ocfs2_refcount_rec_merge(rb, index);
1270
1271    ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1272    if (ret)
1273        mlog_errno(ret);
1274out:
1275    return ret;
1276}
1277
1278static int ocfs2_expand_inline_ref_root(handle_t *handle,
1279                    struct ocfs2_caching_info *ci,
1280                    struct buffer_head *ref_root_bh,
1281                    struct buffer_head **ref_leaf_bh,
1282                    struct ocfs2_alloc_context *meta_ac)
1283{
1284    int ret;
1285    u16 suballoc_bit_start;
1286    u32 num_got;
1287    u64 blkno;
1288    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1289    struct buffer_head *new_bh = NULL;
1290    struct ocfs2_refcount_block *new_rb;
1291    struct ocfs2_refcount_block *root_rb =
1292            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1293
1294    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1295                      OCFS2_JOURNAL_ACCESS_WRITE);
1296    if (ret) {
1297        mlog_errno(ret);
1298        goto out;
1299    }
1300
1301    ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1302                   &suballoc_bit_start, &num_got,
1303                   &blkno);
1304    if (ret) {
1305        mlog_errno(ret);
1306        goto out;
1307    }
1308
1309    new_bh = sb_getblk(sb, blkno);
1310    if (new_bh == NULL) {
1311        ret = -EIO;
1312        mlog_errno(ret);
1313        goto out;
1314    }
1315    ocfs2_set_new_buffer_uptodate(ci, new_bh);
1316
1317    ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1318                      OCFS2_JOURNAL_ACCESS_CREATE);
1319    if (ret) {
1320        mlog_errno(ret);
1321        goto out;
1322    }
1323
1324    /*
1325     * Initialize ocfs2_refcount_block.
1326     * It should contain the same information as the old root.
1327     * so just memcpy it and change the corresponding field.
1328     */
1329    memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1330
1331    new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1332    new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1333    new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1334    new_rb->rf_blkno = cpu_to_le64(blkno);
1335    new_rb->rf_cpos = cpu_to_le32(0);
1336    new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1337    new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1338    ocfs2_journal_dirty(handle, new_bh);
1339
1340    /* Now change the root. */
1341    memset(&root_rb->rf_list, 0, sb->s_blocksize -
1342           offsetof(struct ocfs2_refcount_block, rf_list));
1343    root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1344    root_rb->rf_clusters = cpu_to_le32(1);
1345    root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1346    root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1347    root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1348    root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1349
1350    ocfs2_journal_dirty(handle, ref_root_bh);
1351
1352    mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
1353         le16_to_cpu(new_rb->rf_records.rl_used));
1354
1355    *ref_leaf_bh = new_bh;
1356    new_bh = NULL;
1357out:
1358    brelse(new_bh);
1359    return ret;
1360}
1361
1362static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1363                       struct ocfs2_refcount_rec *next)
1364{
1365    if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1366        ocfs2_get_ref_rec_low_cpos(next))
1367        return 1;
1368
1369    return 0;
1370}
1371
1372static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1373{
1374    const struct ocfs2_refcount_rec *l = a, *r = b;
1375    u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1376    u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1377
1378    if (l_cpos > r_cpos)
1379        return 1;
1380    if (l_cpos < r_cpos)
1381        return -1;
1382    return 0;
1383}
1384
1385static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1386{
1387    const struct ocfs2_refcount_rec *l = a, *r = b;
1388    u64 l_cpos = le64_to_cpu(l->r_cpos);
1389    u64 r_cpos = le64_to_cpu(r->r_cpos);
1390
1391    if (l_cpos > r_cpos)
1392        return 1;
1393    if (l_cpos < r_cpos)
1394        return -1;
1395    return 0;
1396}
1397
1398static void swap_refcount_rec(void *a, void *b, int size)
1399{
1400    struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1401
1402    tmp = *(struct ocfs2_refcount_rec *)l;
1403    *(struct ocfs2_refcount_rec *)l =
1404            *(struct ocfs2_refcount_rec *)r;
1405    *(struct ocfs2_refcount_rec *)r = tmp;
1406}
1407
1408/*
1409 * The refcount cpos are ordered by their 64bit cpos,
1410 * But we will use the low 32 bit to be the e_cpos in the b-tree.
1411 * So we need to make sure that this pos isn't intersected with others.
1412 *
1413 * Note: The refcount block is already sorted by their low 32 bit cpos,
1414 * So just try the middle pos first, and we will exit when we find
1415 * the good position.
1416 */
1417static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1418                     u32 *split_pos, int *split_index)
1419{
1420    int num_used = le16_to_cpu(rl->rl_used);
1421    int delta, middle = num_used / 2;
1422
1423    for (delta = 0; delta < middle; delta++) {
1424        /* Let's check delta earlier than middle */
1425        if (ocfs2_refcount_rec_no_intersect(
1426                    &rl->rl_recs[middle - delta - 1],
1427                    &rl->rl_recs[middle - delta])) {
1428            *split_index = middle - delta;
1429            break;
1430        }
1431
1432        /* For even counts, don't walk off the end */
1433        if ((middle + delta + 1) == num_used)
1434            continue;
1435
1436        /* Now try delta past middle */
1437        if (ocfs2_refcount_rec_no_intersect(
1438                    &rl->rl_recs[middle + delta],
1439                    &rl->rl_recs[middle + delta + 1])) {
1440            *split_index = middle + delta + 1;
1441            break;
1442        }
1443    }
1444
1445    if (delta >= middle)
1446        return -ENOSPC;
1447
1448    *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1449    return 0;
1450}
1451
1452static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1453                        struct buffer_head *new_bh,
1454                        u32 *split_cpos)
1455{
1456    int split_index = 0, num_moved, ret;
1457    u32 cpos = 0;
1458    struct ocfs2_refcount_block *rb =
1459            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1460    struct ocfs2_refcount_list *rl = &rb->rf_records;
1461    struct ocfs2_refcount_block *new_rb =
1462            (struct ocfs2_refcount_block *)new_bh->b_data;
1463    struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1464
1465    mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
1466         (unsigned long long)ref_leaf_bh->b_blocknr,
1467         le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1468
1469    /*
1470     * XXX: Improvement later.
1471     * If we know all the high 32 bit cpos is the same, no need to sort.
1472     *
1473     * In order to make the whole process safe, we do:
1474     * 1. sort the entries by their low 32 bit cpos first so that we can
1475     * find the split cpos easily.
1476     * 2. call ocfs2_insert_extent to insert the new refcount block.
1477     * 3. move the refcount rec to the new block.
1478     * 4. sort the entries by their 64 bit cpos.
1479     * 5. dirty the new_rb and rb.
1480     */
1481    sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1482         sizeof(struct ocfs2_refcount_rec),
1483         cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1484
1485    ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1486    if (ret) {
1487        mlog_errno(ret);
1488        return ret;
1489    }
1490
1491    new_rb->rf_cpos = cpu_to_le32(cpos);
1492
1493    /* move refcount records starting from split_index to the new block. */
1494    num_moved = le16_to_cpu(rl->rl_used) - split_index;
1495    memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1496           num_moved * sizeof(struct ocfs2_refcount_rec));
1497
1498    /*ok, remove the entries we just moved over to the other block. */
1499    memset(&rl->rl_recs[split_index], 0,
1500           num_moved * sizeof(struct ocfs2_refcount_rec));
1501
1502    /* change old and new rl_used accordingly. */
1503    le16_add_cpu(&rl->rl_used, -num_moved);
1504    new_rl->rl_used = cpu_to_le16(num_moved);
1505
1506    sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1507         sizeof(struct ocfs2_refcount_rec),
1508         cmp_refcount_rec_by_cpos, swap_refcount_rec);
1509
1510    sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1511         sizeof(struct ocfs2_refcount_rec),
1512         cmp_refcount_rec_by_cpos, swap_refcount_rec);
1513
1514    *split_cpos = cpos;
1515    return 0;
1516}
1517
1518static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1519                     struct ocfs2_caching_info *ci,
1520                     struct buffer_head *ref_root_bh,
1521                     struct buffer_head *ref_leaf_bh,
1522                     struct ocfs2_alloc_context *meta_ac)
1523{
1524    int ret;
1525    u16 suballoc_bit_start;
1526    u32 num_got, new_cpos;
1527    u64 blkno;
1528    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1529    struct ocfs2_refcount_block *root_rb =
1530            (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1531    struct buffer_head *new_bh = NULL;
1532    struct ocfs2_refcount_block *new_rb;
1533    struct ocfs2_extent_tree ref_et;
1534
1535    BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1536
1537    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1538                      OCFS2_JOURNAL_ACCESS_WRITE);
1539    if (ret) {
1540        mlog_errno(ret);
1541        goto out;
1542    }
1543
1544    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1545                      OCFS2_JOURNAL_ACCESS_WRITE);
1546    if (ret) {
1547        mlog_errno(ret);
1548        goto out;
1549    }
1550
1551    ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1552                   &suballoc_bit_start, &num_got,
1553                   &blkno);
1554    if (ret) {
1555        mlog_errno(ret);
1556        goto out;
1557    }
1558
1559    new_bh = sb_getblk(sb, blkno);
1560    if (new_bh == NULL) {
1561        ret = -EIO;
1562        mlog_errno(ret);
1563        goto out;
1564    }
1565    ocfs2_set_new_buffer_uptodate(ci, new_bh);
1566
1567    ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1568                      OCFS2_JOURNAL_ACCESS_CREATE);
1569    if (ret) {
1570        mlog_errno(ret);
1571        goto out;
1572    }
1573
1574    /* Initialize ocfs2_refcount_block. */
1575    new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1576    memset(new_rb, 0, sb->s_blocksize);
1577    strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1578    new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
1579    new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1580    new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1581    new_rb->rf_blkno = cpu_to_le64(blkno);
1582    new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1583    new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1584    new_rb->rf_records.rl_count =
1585                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1586    new_rb->rf_generation = root_rb->rf_generation;
1587
1588    ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1589    if (ret) {
1590        mlog_errno(ret);
1591        goto out;
1592    }
1593
1594    ocfs2_journal_dirty(handle, ref_leaf_bh);
1595    ocfs2_journal_dirty(handle, new_bh);
1596
1597    ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1598
1599    mlog(0, "insert new leaf block %llu at %u\n",
1600         (unsigned long long)new_bh->b_blocknr, new_cpos);
1601
1602    /* Insert the new leaf block with the specific offset cpos. */
1603    ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1604                  1, 0, meta_ac);
1605    if (ret)
1606        mlog_errno(ret);
1607
1608out:
1609    brelse(new_bh);
1610    return ret;
1611}
1612
1613static int ocfs2_expand_refcount_tree(handle_t *handle,
1614                      struct ocfs2_caching_info *ci,
1615                      struct buffer_head *ref_root_bh,
1616                      struct buffer_head *ref_leaf_bh,
1617                      struct ocfs2_alloc_context *meta_ac)
1618{
1619    int ret;
1620    struct buffer_head *expand_bh = NULL;
1621
1622    if (ref_root_bh == ref_leaf_bh) {
1623        /*
1624         * the old root bh hasn't been expanded to a b-tree,
1625         * so expand it first.
1626         */
1627        ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1628                           &expand_bh, meta_ac);
1629        if (ret) {
1630            mlog_errno(ret);
1631            goto out;
1632        }
1633    } else {
1634        expand_bh = ref_leaf_bh;
1635        get_bh(expand_bh);
1636    }
1637
1638
1639    /* Now add a new refcount block into the tree.*/
1640    ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1641                        expand_bh, meta_ac);
1642    if (ret)
1643        mlog_errno(ret);
1644out:
1645    brelse(expand_bh);
1646    return ret;
1647}
1648
1649/*
1650 * Adjust the extent rec in b-tree representing ref_leaf_bh.
1651 *
1652 * Only called when we have inserted a new refcount rec at index 0
1653 * which means ocfs2_extent_rec.e_cpos may need some change.
1654 */
1655static int ocfs2_adjust_refcount_rec(handle_t *handle,
1656                     struct ocfs2_caching_info *ci,
1657                     struct buffer_head *ref_root_bh,
1658                     struct buffer_head *ref_leaf_bh,
1659                     struct ocfs2_refcount_rec *rec)
1660{
1661    int ret = 0, i;
1662    u32 new_cpos, old_cpos;
1663    struct ocfs2_path *path = NULL;
1664    struct ocfs2_extent_tree et;
1665    struct ocfs2_refcount_block *rb =
1666        (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1667    struct ocfs2_extent_list *el;
1668
1669    if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1670        goto out;
1671
1672    rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1673    old_cpos = le32_to_cpu(rb->rf_cpos);
1674    new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1675    if (old_cpos <= new_cpos)
1676        goto out;
1677
1678    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1679
1680    path = ocfs2_new_path_from_et(&et);
1681    if (!path) {
1682        ret = -ENOMEM;
1683        mlog_errno(ret);
1684        goto out;
1685    }
1686
1687    ret = ocfs2_find_path(ci, path, old_cpos);
1688    if (ret) {
1689        mlog_errno(ret);
1690        goto out;
1691    }
1692
1693    /*
1694     * 2 more credits, one for the leaf refcount block, one for
1695     * the extent block contains the extent rec.
1696     */
1697    ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
1698    if (ret < 0) {
1699        mlog_errno(ret);
1700        goto out;
1701    }
1702
1703    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1704                      OCFS2_JOURNAL_ACCESS_WRITE);
1705    if (ret < 0) {
1706        mlog_errno(ret);
1707        goto out;
1708    }
1709
1710    ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1711                      OCFS2_JOURNAL_ACCESS_WRITE);
1712    if (ret < 0) {
1713        mlog_errno(ret);
1714        goto out;
1715    }
1716
1717    /* change the leaf extent block first. */
1718    el = path_leaf_el(path);
1719
1720    for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1721        if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1722            break;
1723
1724    BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1725
1726    el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1727
1728    /* change the r_cpos in the leaf block. */
1729    rb->rf_cpos = cpu_to_le32(new_cpos);
1730
1731    ocfs2_journal_dirty(handle, path_leaf_bh(path));
1732    ocfs2_journal_dirty(handle, ref_leaf_bh);
1733
1734out:
1735    ocfs2_free_path(path);
1736    return ret;
1737}
1738
1739static int ocfs2_insert_refcount_rec(handle_t *handle,
1740                     struct ocfs2_caching_info *ci,
1741                     struct buffer_head *ref_root_bh,
1742                     struct buffer_head *ref_leaf_bh,
1743                     struct ocfs2_refcount_rec *rec,
1744                     int index, int merge,
1745                     struct ocfs2_alloc_context *meta_ac)
1746{
1747    int ret;
1748    struct ocfs2_refcount_block *rb =
1749            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1750    struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1751    struct buffer_head *new_bh = NULL;
1752
1753    BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1754
1755    if (rf_list->rl_used == rf_list->rl_count) {
1756        u64 cpos = le64_to_cpu(rec->r_cpos);
1757        u32 len = le32_to_cpu(rec->r_clusters);
1758
1759        ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1760                         ref_leaf_bh, meta_ac);
1761        if (ret) {
1762            mlog_errno(ret);
1763            goto out;
1764        }
1765
1766        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1767                         cpos, len, NULL, &index,
1768                         &new_bh);
1769        if (ret) {
1770            mlog_errno(ret);
1771            goto out;
1772        }
1773
1774        ref_leaf_bh = new_bh;
1775        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1776        rf_list = &rb->rf_records;
1777    }
1778
1779    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1780                      OCFS2_JOURNAL_ACCESS_WRITE);
1781    if (ret) {
1782        mlog_errno(ret);
1783        goto out;
1784    }
1785
1786    if (index < le16_to_cpu(rf_list->rl_used))
1787        memmove(&rf_list->rl_recs[index + 1],
1788            &rf_list->rl_recs[index],
1789            (le16_to_cpu(rf_list->rl_used) - index) *
1790             sizeof(struct ocfs2_refcount_rec));
1791
1792    mlog(0, "insert refcount record start %llu, len %u, count %u "
1793         "to leaf block %llu at index %d\n",
1794         (unsigned long long)le64_to_cpu(rec->r_cpos),
1795         le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
1796         (unsigned long long)ref_leaf_bh->b_blocknr, index);
1797
1798    rf_list->rl_recs[index] = *rec;
1799
1800    le16_add_cpu(&rf_list->rl_used, 1);
1801
1802    if (merge)
1803        ocfs2_refcount_rec_merge(rb, index);
1804
1805    ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1806    if (ret) {
1807        mlog_errno(ret);
1808        goto out;
1809    }
1810
1811    if (index == 0) {
1812        ret = ocfs2_adjust_refcount_rec(handle, ci,
1813                        ref_root_bh,
1814                        ref_leaf_bh, rec);
1815        if (ret)
1816            mlog_errno(ret);
1817    }
1818out:
1819    brelse(new_bh);
1820    return ret;
1821}
1822
1823/*
1824 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
 * This is much simpler than our b-tree code.
1826 * split_rec is the new refcount rec we want to insert.
1827 * If split_rec->r_refcount > 0, we are changing the refcount(in case we
1828 * increase refcount or decrease a refcount to non-zero).
1829 * If split_rec->r_refcount == 0, we are punching a hole in current refcount
1830 * rec( in case we decrease a refcount to zero).
1831 */
1832static int ocfs2_split_refcount_rec(handle_t *handle,
1833                    struct ocfs2_caching_info *ci,
1834                    struct buffer_head *ref_root_bh,
1835                    struct buffer_head *ref_leaf_bh,
1836                    struct ocfs2_refcount_rec *split_rec,
1837                    int index, int merge,
1838                    struct ocfs2_alloc_context *meta_ac,
1839                    struct ocfs2_cached_dealloc_ctxt *dealloc)
1840{
1841    int ret, recs_need;
1842    u32 len;
1843    struct ocfs2_refcount_block *rb =
1844            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1845    struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1846    struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1847    struct ocfs2_refcount_rec *tail_rec = NULL;
1848    struct buffer_head *new_bh = NULL;
1849
1850    BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1851
1852    mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
1853         le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
1854         le64_to_cpu(split_rec->r_cpos),
1855         le32_to_cpu(split_rec->r_clusters));
1856
1857    /*
1858     * If we just need to split the header or tail clusters,
1859     * no more recs are needed, just split is OK.
1860     * Otherwise we at least need one new recs.
1861     */
1862    if (!split_rec->r_refcount &&
1863        (split_rec->r_cpos == orig_rec->r_cpos ||
1864         le64_to_cpu(split_rec->r_cpos) +
1865         le32_to_cpu(split_rec->r_clusters) ==
1866         le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1867        recs_need = 0;
1868    else
1869        recs_need = 1;
1870
1871    /*
1872     * We need one more rec if we split in the middle and the new rec have
1873     * some refcount in it.
1874     */
1875    if (split_rec->r_refcount &&
1876        (split_rec->r_cpos != orig_rec->r_cpos &&
1877         le64_to_cpu(split_rec->r_cpos) +
1878         le32_to_cpu(split_rec->r_clusters) !=
1879         le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1880        recs_need++;
1881
1882    /* If the leaf block don't have enough record, expand it. */
1883    if (le16_to_cpu(rf_list->rl_used) + recs_need >
1884                     le16_to_cpu(rf_list->rl_count)) {
1885        struct ocfs2_refcount_rec tmp_rec;
1886        u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1887        len = le32_to_cpu(orig_rec->r_clusters);
1888        ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1889                         ref_leaf_bh, meta_ac);
1890        if (ret) {
1891            mlog_errno(ret);
1892            goto out;
1893        }
1894
1895        /*
1896         * We have to re-get it since now cpos may be moved to
1897         * another leaf block.
1898         */
1899        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1900                         cpos, len, &tmp_rec, &index,
1901                         &new_bh);
1902        if (ret) {
1903            mlog_errno(ret);
1904            goto out;
1905        }
1906
1907        ref_leaf_bh = new_bh;
1908        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1909        rf_list = &rb->rf_records;
1910        orig_rec = &rf_list->rl_recs[index];
1911    }
1912
1913    ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1914                      OCFS2_JOURNAL_ACCESS_WRITE);
1915    if (ret) {
1916        mlog_errno(ret);
1917        goto out;
1918    }
1919
1920    /*
1921     * We have calculated out how many new records we need and store
1922     * in recs_need, so spare enough space first by moving the records
1923     * after "index" to the end.
1924     */
1925    if (index != le16_to_cpu(rf_list->rl_used) - 1)
1926        memmove(&rf_list->rl_recs[index + 1 + recs_need],
1927            &rf_list->rl_recs[index + 1],
1928            (le16_to_cpu(rf_list->rl_used) - index - 1) *
1929             sizeof(struct ocfs2_refcount_rec));
1930
1931    len = (le64_to_cpu(orig_rec->r_cpos) +
1932          le32_to_cpu(orig_rec->r_clusters)) -
1933          (le64_to_cpu(split_rec->r_cpos) +
1934          le32_to_cpu(split_rec->r_clusters));
1935
1936    /*
1937     * If we have "len", the we will split in the tail and move it
1938     * to the end of the space we have just spared.
1939     */
1940    if (len) {
1941        tail_rec = &rf_list->rl_recs[index + recs_need];
1942
1943        memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1944        le64_add_cpu(&tail_rec->r_cpos,
1945                 le32_to_cpu(tail_rec->r_clusters) - len);
1946        tail_rec->r_clusters = cpu_to_le32(len);
1947    }
1948
1949    /*
1950     * If the split pos isn't the same as the original one, we need to
1951     * split in the head.
1952     *
1953     * Note: We have the chance that split_rec.r_refcount = 0,
1954     * recs_need = 0 and len > 0, which means we just cut the head from
1955     * the orig_rec and in that case we have done some modification in
1956     * orig_rec above, so the check for r_cpos is faked.
1957     */
1958    if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1959        len = le64_to_cpu(split_rec->r_cpos) -
1960              le64_to_cpu(orig_rec->r_cpos);
1961        orig_rec->r_clusters = cpu_to_le32(len);
1962        index++;
1963    }
1964
1965    le16_add_cpu(&rf_list->rl_used, recs_need);
1966
1967    if (split_rec->r_refcount) {
1968        rf_list->rl_recs[index] = *split_rec;
1969        mlog(0, "insert refcount record start %llu, len %u, count %u "
1970             "to leaf block %llu at index %d\n",
1971             (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1972             le32_to_cpu(split_rec->r_clusters),
1973             le32_to_cpu(split_rec->r_refcount),
1974             (unsigned long long)ref_leaf_bh->b_blocknr, index);
1975
1976        if (merge)
1977            ocfs2_refcount_rec_merge(rb, index);
1978    }
1979
1980    ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1981    if (ret)
1982        mlog_errno(ret);
1983
1984out:
1985    brelse(new_bh);
1986    return ret;
1987}
1988
1989static int __ocfs2_increase_refcount(handle_t *handle,
1990                     struct ocfs2_caching_info *ci,
1991                     struct buffer_head *ref_root_bh,
1992                     u64 cpos, u32 len, int merge,
1993                     struct ocfs2_alloc_context *meta_ac,
1994                     struct ocfs2_cached_dealloc_ctxt *dealloc)
1995{
1996    int ret = 0, index;
1997    struct buffer_head *ref_leaf_bh = NULL;
1998    struct ocfs2_refcount_rec rec;
1999    unsigned int set_len = 0;
2000
2001    mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
2002         (unsigned long long)ocfs2_metadata_cache_owner(ci),
2003         (unsigned long long)cpos, len);
2004
2005    while (len) {
2006        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2007                         cpos, len, &rec, &index,
2008                         &ref_leaf_bh);
2009        if (ret) {
2010            mlog_errno(ret);
2011            goto out;
2012        }
2013
2014        set_len = le32_to_cpu(rec.r_clusters);
2015
2016        /*
2017         * Here we may meet with 3 situations:
2018         *
2019         * 1. If we find an already existing record, and the length
2020         * is the same, cool, we just need to increase the r_refcount
2021         * and it is OK.
2022         * 2. If we find a hole, just insert it with r_refcount = 1.
2023         * 3. If we are in the middle of one extent record, split
2024         * it.
2025         */
2026        if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2027            set_len <= len) {
2028            mlog(0, "increase refcount rec, start %llu, len %u, "
2029                 "count %u\n", (unsigned long long)cpos, set_len,
2030                 le32_to_cpu(rec.r_refcount));
2031            ret = ocfs2_change_refcount_rec(handle, ci,
2032                            ref_leaf_bh, index,
2033                            merge, 1);
2034            if (ret) {
2035                mlog_errno(ret);
2036                goto out;
2037            }
2038        } else if (!rec.r_refcount) {
2039            rec.r_refcount = cpu_to_le32(1);
2040
2041            mlog(0, "insert refcount rec, start %llu, len %u\n",
2042                 (unsigned long long)le64_to_cpu(rec.r_cpos),
2043                 set_len);
2044            ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
2045                            ref_leaf_bh,
2046                            &rec, index,
2047                            merge, meta_ac);
2048            if (ret) {
2049                mlog_errno(ret);
2050                goto out;
2051            }
2052        } else {
2053            set_len = min((u64)(cpos + len),
2054                      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
2055            rec.r_cpos = cpu_to_le64(cpos);
2056            rec.r_clusters = cpu_to_le32(set_len);
2057            le32_add_cpu(&rec.r_refcount, 1);
2058
2059            mlog(0, "split refcount rec, start %llu, "
2060                 "len %u, count %u\n",
2061                 (unsigned long long)le64_to_cpu(rec.r_cpos),
2062                 set_len, le32_to_cpu(rec.r_refcount));
2063            ret = ocfs2_split_refcount_rec(handle, ci,
2064                               ref_root_bh, ref_leaf_bh,
2065                               &rec, index, merge,
2066                               meta_ac, dealloc);
2067            if (ret) {
2068                mlog_errno(ret);
2069                goto out;
2070            }
2071        }
2072
2073        cpos += set_len;
2074        len -= set_len;
2075        brelse(ref_leaf_bh);
2076        ref_leaf_bh = NULL;
2077    }
2078
2079out:
2080    brelse(ref_leaf_bh);
2081    return ret;
2082}
2083
2084static int ocfs2_remove_refcount_extent(handle_t *handle,
2085                struct ocfs2_caching_info *ci,
2086                struct buffer_head *ref_root_bh,
2087                struct buffer_head *ref_leaf_bh,
2088                struct ocfs2_alloc_context *meta_ac,
2089                struct ocfs2_cached_dealloc_ctxt *dealloc)
2090{
2091    int ret;
2092    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2093    struct ocfs2_refcount_block *rb =
2094            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2095    struct ocfs2_extent_tree et;
2096
2097    BUG_ON(rb->rf_records.rl_used);
2098
2099    ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2100    ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2101                  1, meta_ac, dealloc);
2102    if (ret) {
2103        mlog_errno(ret);
2104        goto out;
2105    }
2106
2107    ocfs2_remove_from_cache(ci, ref_leaf_bh);
2108
2109    /*
2110     * add the freed block to the dealloc so that it will be freed
2111     * when we run dealloc.
2112     */
2113    ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2114                    le16_to_cpu(rb->rf_suballoc_slot),
2115                    le64_to_cpu(rb->rf_blkno),
2116                    le16_to_cpu(rb->rf_suballoc_bit));
2117    if (ret) {
2118        mlog_errno(ret);
2119        goto out;
2120    }
2121
2122    ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2123                      OCFS2_JOURNAL_ACCESS_WRITE);
2124    if (ret) {
2125        mlog_errno(ret);
2126        goto out;
2127    }
2128
2129    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2130
2131    le32_add_cpu(&rb->rf_clusters, -1);
2132
2133    /*
2134     * check whether we need to restore the root refcount block if
2135     * there is no leaf extent block at atll.
2136     */
2137    if (!rb->rf_list.l_next_free_rec) {
2138        BUG_ON(rb->rf_clusters);
2139
2140        mlog(0, "reset refcount tree root %llu to be a record block.\n",
2141             (unsigned long long)ref_root_bh->b_blocknr);
2142
2143        rb->rf_flags = 0;
2144        rb->rf_parent = 0;
2145        rb->rf_cpos = 0;
2146        memset(&rb->rf_records, 0, sb->s_blocksize -
2147               offsetof(struct ocfs2_refcount_block, rf_records));
2148        rb->rf_records.rl_count =
2149                cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2150    }
2151
2152    ocfs2_journal_dirty(handle, ref_root_bh);
2153
2154out:
2155    return ret;
2156}
2157
2158int ocfs2_increase_refcount(handle_t *handle,
2159                struct ocfs2_caching_info *ci,
2160                struct buffer_head *ref_root_bh,
2161                u64 cpos, u32 len,
2162                struct ocfs2_alloc_context *meta_ac,
2163                struct ocfs2_cached_dealloc_ctxt *dealloc)
2164{
2165    return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2166                     cpos, len, 1,
2167                     meta_ac, dealloc);
2168}
2169
2170static int ocfs2_decrease_refcount_rec(handle_t *handle,
2171                struct ocfs2_caching_info *ci,
2172                struct buffer_head *ref_root_bh,
2173                struct buffer_head *ref_leaf_bh,
2174                int index, u64 cpos, unsigned int len,
2175                struct ocfs2_alloc_context *meta_ac,
2176                struct ocfs2_cached_dealloc_ctxt *dealloc)
2177{
2178    int ret;
2179    struct ocfs2_refcount_block *rb =
2180            (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2181    struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2182
2183    BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2184    BUG_ON(cpos + len >
2185           le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2186
2187    if (cpos == le64_to_cpu(rec->r_cpos) &&
2188        len == le32_to_cpu(rec->r_clusters))
2189        ret = ocfs2_change_refcount_rec(handle, ci,
2190                        ref_leaf_bh, index, 1, -1);
2191    else {
2192        struct ocfs2_refcount_rec split = *rec;
2193        split.r_cpos = cpu_to_le64(cpos);
2194        split.r_clusters = cpu_to_le32(len);
2195
2196        le32_add_cpu(&split.r_refcount, -1);
2197
2198        mlog(0, "split refcount rec, start %llu, "
2199             "len %u, count %u, original start %llu, len %u\n",
2200             (unsigned long long)le64_to_cpu(split.r_cpos),
2201             len, le32_to_cpu(split.r_refcount),
2202             (unsigned long long)le64_to_cpu(rec->r_cpos),
2203             le32_to_cpu(rec->r_clusters));
2204        ret = ocfs2_split_refcount_rec(handle, ci,
2205                           ref_root_bh, ref_leaf_bh,
2206                           &split, index, 1,
2207                           meta_ac, dealloc);
2208    }
2209
2210    if (ret) {
2211        mlog_errno(ret);
2212        goto out;
2213    }
2214
2215    /* Remove the leaf refcount block if it contains no refcount record. */
2216    if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2217        ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2218                           ref_leaf_bh, meta_ac,
2219                           dealloc);
2220        if (ret)
2221            mlog_errno(ret);
2222    }
2223
2224out:
2225    return ret;
2226}
2227
2228static int __ocfs2_decrease_refcount(handle_t *handle,
2229                     struct ocfs2_caching_info *ci,
2230                     struct buffer_head *ref_root_bh,
2231                     u64 cpos, u32 len,
2232                     struct ocfs2_alloc_context *meta_ac,
2233                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2234                     int delete)
2235{
2236    int ret = 0, index = 0;
2237    struct ocfs2_refcount_rec rec;
2238    unsigned int r_count = 0, r_len;
2239    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2240    struct buffer_head *ref_leaf_bh = NULL;
2241
2242    mlog(0, "Tree owner %llu, decrease refcount start %llu, "
2243         "len %u, delete %u\n",
2244         (unsigned long long)ocfs2_metadata_cache_owner(ci),
2245         (unsigned long long)cpos, len, delete);
2246
2247    while (len) {
2248        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2249                         cpos, len, &rec, &index,
2250                         &ref_leaf_bh);
2251        if (ret) {
2252            mlog_errno(ret);
2253            goto out;
2254        }
2255
2256        r_count = le32_to_cpu(rec.r_refcount);
2257        BUG_ON(r_count == 0);
2258        if (!delete)
2259            BUG_ON(r_count > 1);
2260
2261        r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2262                  le32_to_cpu(rec.r_clusters)) - cpos;
2263
2264        ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2265                          ref_leaf_bh, index,
2266                          cpos, r_len,
2267                          meta_ac, dealloc);
2268        if (ret) {
2269            mlog_errno(ret);
2270            goto out;
2271        }
2272
2273        if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2274            ret = ocfs2_cache_cluster_dealloc(dealloc,
2275                      ocfs2_clusters_to_blocks(sb, cpos),
2276                              r_len);
2277            if (ret) {
2278                mlog_errno(ret);
2279                goto out;
2280            }
2281        }
2282
2283        cpos += r_len;
2284        len -= r_len;
2285        brelse(ref_leaf_bh);
2286        ref_leaf_bh = NULL;
2287    }
2288
2289out:
2290    brelse(ref_leaf_bh);
2291    return ret;
2292}
2293
2294/* Caller must hold refcount tree lock. */
2295int ocfs2_decrease_refcount(struct inode *inode,
2296                handle_t *handle, u32 cpos, u32 len,
2297                struct ocfs2_alloc_context *meta_ac,
2298                struct ocfs2_cached_dealloc_ctxt *dealloc,
2299                int delete)
2300{
2301    int ret;
2302    u64 ref_blkno;
2303    struct ocfs2_inode_info *oi = OCFS2_I(inode);
2304    struct buffer_head *ref_root_bh = NULL;
2305    struct ocfs2_refcount_tree *tree;
2306
2307    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2308
2309    ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2310    if (ret) {
2311        mlog_errno(ret);
2312        goto out;
2313    }
2314
2315    ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2316    if (ret) {
2317        mlog_errno(ret);
2318        goto out;
2319    }
2320
2321    ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2322                    &ref_root_bh);
2323    if (ret) {
2324        mlog_errno(ret);
2325        goto out;
2326    }
2327
2328    ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2329                    cpos, len, meta_ac, dealloc, delete);
2330    if (ret)
2331        mlog_errno(ret);
2332out:
2333    brelse(ref_root_bh);
2334    return ret;
2335}
2336
2337/*
2338 * Mark the already-existing extent at cpos as refcounted for len clusters.
2339 * This adds the refcount extent flag.
2340 *
2341 * If the existing extent is larger than the request, initiate a
2342 * split. An attempt will be made at merging with adjacent extents.
2343 *
2344 * The caller is responsible for passing down meta_ac if we'll need it.
2345 */
2346static int ocfs2_mark_extent_refcounted(struct inode *inode,
2347                struct ocfs2_extent_tree *et,
2348                handle_t *handle, u32 cpos,
2349                u32 len, u32 phys,
2350                struct ocfs2_alloc_context *meta_ac,
2351                struct ocfs2_cached_dealloc_ctxt *dealloc)
2352{
2353    int ret;
2354
2355    mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
2356         inode->i_ino, cpos, len, phys);
2357
2358    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2359        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2360                "tree, but the feature bit is not set in the "
2361                "super block.", inode->i_ino);
2362        ret = -EROFS;
2363        goto out;
2364    }
2365
2366    ret = ocfs2_change_extent_flag(handle, et, cpos,
2367                       len, phys, meta_ac, dealloc,
2368                       OCFS2_EXT_REFCOUNTED, 0);
2369    if (ret)
2370        mlog_errno(ret);
2371
2372out:
2373    return ret;
2374}
2375
2376/*
2377 * Given some contiguous physical clusters, calculate what we need
2378 * for modifying their refcount.
2379 */
2380static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2381                        struct ocfs2_caching_info *ci,
2382                        struct buffer_head *ref_root_bh,
2383                        u64 start_cpos,
2384                        u32 clusters,
2385                        int *meta_add,
2386                        int *credits)
2387{
2388    int ret = 0, index, ref_blocks = 0, recs_add = 0;
2389    u64 cpos = start_cpos;
2390    struct ocfs2_refcount_block *rb;
2391    struct ocfs2_refcount_rec rec;
2392    struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2393    u32 len;
2394
2395    mlog(0, "start_cpos %llu, clusters %u\n",
2396         (unsigned long long)start_cpos, clusters);
2397    while (clusters) {
2398        ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2399                         cpos, clusters, &rec,
2400                         &index, &ref_leaf_bh);
2401        if (ret) {
2402            mlog_errno(ret);
2403            goto out;
2404        }
2405
2406        if (ref_leaf_bh != prev_bh) {
2407            /*
2408             * Now we encounter a new leaf block, so calculate
2409             * whether we need to extend the old leaf.
2410             */
2411            if (prev_bh) {
2412                rb = (struct ocfs2_refcount_block *)
2413                            prev_bh->b_data;
2414
2415                if (le64_to_cpu(rb->rf_records.rl_used) +
2416                    recs_add >
2417                    le16_to_cpu(rb->rf_records.rl_count))
2418                    ref_blocks++;
2419            }
2420
2421            recs_add = 0;
2422            *credits += 1;
2423            brelse(prev_bh);
2424            prev_bh = ref_leaf_bh;
2425            get_bh(prev_bh);
2426        }
2427
2428        rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2429
2430        mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
2431             "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
2432             recs_add, (unsigned long long)cpos, clusters,
2433             (unsigned long long)le64_to_cpu(rec.r_cpos),
2434             le32_to_cpu(rec.r_clusters),
2435             le32_to_cpu(rec.r_refcount), index);
2436
2437        len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2438              le32_to_cpu(rec.r_clusters)) - cpos;
2439        /*
2440         * If the refcount rec already exist, cool. We just need
2441         * to check whether there is a split. Otherwise we just need
2442         * to increase the refcount.
2443         * If we will insert one, increases recs_add.
2444         *
2445         * We record all the records which will be inserted to the
2446         * same refcount block, so that we can tell exactly whether
2447         * we need a new refcount block or not.
2448         */
2449        if (rec.r_refcount) {
2450            /* Check whether we need a split at the beginning. */
2451            if (cpos == start_cpos &&
2452                cpos != le64_to_cpu(rec.r_cpos))
2453                recs_add++;
2454
2455            /* Check whether we need a split in the end. */
2456            if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2457                le32_to_cpu(rec.r_clusters))
2458                recs_add++;
2459        } else
2460            recs_add++;
2461
2462        brelse(ref_leaf_bh);
2463        ref_leaf_bh = NULL;
2464        clusters -= len;
2465        cpos += len;
2466    }
2467
2468    if (prev_bh) {
2469        rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2470
2471        if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
2472            le16_to_cpu(rb->rf_records.rl_count))
2473            ref_blocks++;
2474
2475        *credits += 1;
2476    }
2477
2478    if (!ref_blocks)
2479        goto out;
2480
2481    mlog(0, "we need ref_blocks %d\n", ref_blocks);
2482    *meta_add += ref_blocks;
2483    *credits += ref_blocks;
2484
2485    /*
2486     * So we may need ref_blocks to insert into the tree.
2487     * That also means we need to change the b-tree and add that number
2488     * of records since we never merge them.
2489     * We need one more block for expansion since the new created leaf
2490     * block is also full and needs split.
2491     */
2492    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2493    if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2494        struct ocfs2_extent_tree et;
2495
2496        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2497        *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2498        *credits += ocfs2_calc_extend_credits(sb,
2499                              et.et_root_el,
2500                              ref_blocks);
2501    } else {
2502        *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2503        *meta_add += 1;
2504    }
2505
2506out:
2507    brelse(ref_leaf_bh);
2508    brelse(prev_bh);
2509    return ret;
2510}
2511
2512/*
2513 * For refcount tree, we will decrease some contiguous clusters
2514 * refcount count, so just go through it to see how many blocks
2515 * we gonna touch and whether we need to create new blocks.
2516 *
2517 * Normally the refcount blocks store these refcount should be
2518 * contiguous also, so that we can get the number easily.
2519 * As for meta_ac, we will at most add split 2 refcount record and
2520 * 2 more refcount block, so just check it in a rough way.
2521 *
2522 * Caller must hold refcount tree lock.
2523 */
2524int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2525                      struct buffer_head *di_bh,
2526                      u64 phys_blkno,
2527                      u32 clusters,
2528                      int *credits,
2529                      struct ocfs2_alloc_context **meta_ac)
2530{
2531    int ret, ref_blocks = 0;
2532    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2533    struct ocfs2_inode_info *oi = OCFS2_I(inode);
2534    struct buffer_head *ref_root_bh = NULL;
2535    struct ocfs2_refcount_tree *tree;
2536    u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2537
2538    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2539        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
2540                "tree, but the feature bit is not set in the "
2541                "super block.", inode->i_ino);
2542        ret = -EROFS;
2543        goto out;
2544    }
2545
2546    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2547
2548    ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2549                      le64_to_cpu(di->i_refcount_loc), &tree);
2550    if (ret) {
2551        mlog_errno(ret);
2552        goto out;
2553    }
2554
2555    ret = ocfs2_read_refcount_block(&tree->rf_ci,
2556                    le64_to_cpu(di->i_refcount_loc),
2557                    &ref_root_bh);
2558    if (ret) {
2559        mlog_errno(ret);
2560        goto out;
2561    }
2562
2563    ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2564                           &tree->rf_ci,
2565                           ref_root_bh,
2566                           start_cpos, clusters,
2567                           &ref_blocks, credits);
2568    if (ret) {
2569        mlog_errno(ret);
2570        goto out;
2571    }
2572
2573    mlog(0, "reserve new metadata %d, credits = %d\n",
2574         ref_blocks, *credits);
2575
2576    if (ref_blocks) {
2577        ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2578                            ref_blocks, meta_ac);
2579        if (ret)
2580            mlog_errno(ret);
2581    }
2582
2583out:
2584    brelse(ref_root_bh);
2585    return ret;
2586}
2587
/* Byte size (1048576) that CoW tries to keep contiguous. */
#define MAX_CONTIG_BYTES 1048576

/* Number of clusters needed to hold MAX_CONTIG_BYTES on this volume. */
static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
{
    unsigned int contig = ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);

    return contig;
}
2594
/*
 * Bit mask that clears the low bits below the contig cluster count.
 * NOTE(review): only a clean alignment mask if that count is a power of
 * two - confirm against ocfs2_clusters_for_bytes().
 */
static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
{
    unsigned int contig = ocfs2_cow_contig_clusters(sb);

    return ~(contig - 1);
}
2599
/*
 * For an extent beginning at 'start' and an I/O beginning at 'cpos',
 * return the offset of the form (start + (n * contig_clusters)) that is
 * closest to cpos without exceeding it.
 *
 * This lets the extent be broken at a multiple of contig_clusters.
 * 'start' must not be greater than 'cpos' (enforced with BUG_ON).
 */
static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
                         unsigned int start,
                         unsigned int cpos)
{
    unsigned int into_extent;

    BUG_ON(start > cpos);

    /* Round the distance into the extent down with the contig mask. */
    into_extent = cpos - start;

    return start + (into_extent & ocfs2_cow_contig_mask(sb));
}
2615
/*
 * Round a cluster count 'len' up so that it is a multiple of
 * contig_clusters.  If the rounding wraps around, UINT_MAX is returned
 * instead.
 */
static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
                          unsigned int len)
{
    unsigned int contig = ocfs2_cow_contig_clusters(sb);
    unsigned int rounded = (len + (contig - 1)) & ocfs2_cow_contig_mask(sb);

    /* A result smaller than the input means the addition wrapped. */
    return rounded < len ? UINT_MAX : rounded;
}
2633
/*
 * Calculate the start and number of virtual clusters we need to CoW.
 *
 * cpos is the virtual start cluster position we want to do CoW in a
 * file and write_len is the cluster length.
 * max_cpos is the place where we want to stop CoW intentionally.
 *
 * Normally we will start CoW from the beginning of the extent record
 * containing cpos.
 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
 * get good I/O from the resulting extent tree.
 */
2645static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2646                       struct ocfs2_extent_list *el,
2647                       u32 cpos,
2648                       u32 write_len,
2649                       u32 max_cpos,
2650                       u32 *cow_start,
2651                       u32 *cow_len)
2652{
2653    int ret = 0;
2654    int tree_height = le16_to_cpu(el->l_tree_depth), i;
2655    struct buffer_head *eb_bh = NULL;
2656    struct ocfs2_extent_block *eb = NULL;
2657    struct ocfs2_extent_rec *rec;
2658    unsigned int want_clusters, rec_end = 0;
2659    int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2660    int leaf_clusters;
2661
2662    BUG_ON(cpos + write_len > max_cpos);
2663
2664    if (tree_height > 0) {
2665        ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2666        if (ret) {
2667            mlog_errno(ret);
2668            goto out;
2669        }
2670
2671        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2672        el = &eb->h_list;
2673
2674        if (el->l_tree_depth) {
2675            ocfs2_error(inode->i_sb,
2676                    "Inode %lu has non zero tree depth in "
2677                    "leaf block %llu\n", inode->i_ino,
2678                    (unsigned long long)eb_bh->b_blocknr);
2679            ret = -EROFS;
2680            goto out;
2681        }
2682    }
2683
2684    *cow_len = 0;
2685    for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2686        rec = &el->l_recs[i];
2687
2688        if (ocfs2_is_empty_extent(rec)) {
2689            mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2690                    "index %d\n", inode->i_ino, i);
2691            continue;
2692        }
2693
2694        if (le32_to_cpu(rec->e_cpos) +
2695            le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2696            continue;
2697
2698        if (*cow_len == 0) {
2699            /*
2700             * We should find a refcounted record in the
2701             * first pass.
2702             */
2703            BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2704            *cow_start = le32_to_cpu(rec->e_cpos);
2705        }
2706
2707        /*
2708         * If we encounter a hole, a non-refcounted record or
2709         * pass the max_cpos, stop the search.
2710         */
2711        if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2712            (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2713            (max_cpos <= le32_to_cpu(rec->e_cpos)))
2714            break;
2715
2716        leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2717        rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2718        if (rec_end > max_cpos) {
2719            rec_end = max_cpos;
2720            leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2721        }
2722
2723        /*
2724         * How many clusters do we actually need from
2725         * this extent? First we see how many we actually
2726         * need to complete the write. If that's smaller
2727         * than contig_clusters, we try for contig_clusters.
2728         */
2729        if (!*cow_len)
2730            want_clusters = write_len;
2731        else
2732            want_clusters = (cpos + write_len) -
2733                (*cow_start + *cow_len);
2734        if (want_clusters < contig_clusters)
2735            want_clusters = contig_clusters;
2736
2737        /*
2738         * If the write does not cover the whole extent, we
2739         * need to calculate how we're going to split the extent.
2740         * We try to do it on contig_clusters boundaries.
2741         *
2742         * Any extent smaller than contig_clusters will be
2743         * CoWed in its entirety.
2744         */
2745        if (leaf_clusters <= contig_clusters)
2746            *cow_len += leaf_clusters;
2747        else if (*cow_len || (*cow_start == cpos)) {
2748            /*
2749             * This extent needs to be CoW'd from its
2750             * beginning, so all we have to do is compute
2751             * how many clusters to grab. We align
2752             * want_clusters to the edge of contig_clusters
2753             * to get better I/O.
2754             */
2755            want_clusters = ocfs2_cow_align_length(inode->i_sb,
2756                                   want_clusters);
2757
2758            if (leaf_clusters < want_clusters)
2759                *cow_len += leaf_clusters;
2760            else
2761                *cow_len += want_clusters;
2762        } else if ((*cow_start + contig_clusters) >=
2763               (cpos + write_len)) {
2764            /*
2765             * Breaking off contig_clusters at the front
2766             * of the extent will cover our write. That's
2767             * easy.
2768             */
2769            *cow_len = contig_clusters;
2770        } else if ((rec_end - cpos) <= contig_clusters) {
2771            /*
2772             * Breaking off contig_clusters at the tail of
2773             * this extent will cover cpos.
2774             */
2775            *cow_start = rec_end - contig_clusters;
2776            *cow_len = contig_clusters;
2777        } else if ((rec_end - cpos) <= want_clusters) {
2778            /*
2779             * While we can't fit the entire write in this
2780             * extent, we know that the write goes from cpos
2781             * to the end of the extent. Break that off.
2782             * We try to break it at some multiple of
2783             * contig_clusters from the front of the extent.
2784             * Failing that (ie, cpos is within
2785             * contig_clusters of the front), we'll CoW the
2786             * entire extent.
2787             */
2788            *cow_start = ocfs2_cow_align_start(inode->i_sb,
2789                               *cow_start, cpos);
2790            *cow_len = rec_end - *cow_start;
2791        } else {
2792            /*
2793             * Ok, the entire write lives in the middle of
2794             * this extent. Let's try to slice the extent up
2795             * nicely. Optimally, our CoW region starts at
2796             * m*contig_clusters from the beginning of the
2797             * extent and goes for n*contig_clusters,
2798             * covering the entire write.
2799             */
2800            *cow_start = ocfs2_cow_align_start(inode->i_sb,
2801                               *cow_start, cpos);
2802
2803            want_clusters = (cpos + write_len) - *cow_start;
2804            want_clusters = ocfs2_cow_align_length(inode->i_sb,
2805                                   want_clusters);
2806            if (*cow_start + want_clusters <= rec_end)
2807                *cow_len = want_clusters;
2808            else
2809                *cow_len = rec_end - *cow_start;
2810        }
2811
2812        /* Have we covered our entire write yet? */
2813        if ((*cow_start + *cow_len) >= (cpos + write_len))
2814            break;
2815
2816        /*
2817         * If we reach the end of the extent block and don't get enough
2818         * clusters, continue with the next extent block if possible.
2819         */
2820        if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2821            eb && eb->h_next_leaf_blk) {
2822            brelse(eb_bh);
2823            eb_bh = NULL;
2824
2825            ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2826                           le64_to_cpu(eb->h_next_leaf_blk),
2827                           &eb_bh);
2828            if (ret) {
2829                mlog_errno(ret);
2830                goto out;
2831            }
2832
2833            eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2834            el = &eb->h_list;
2835            i = -1;
2836        }
2837    }
2838
2839out:
2840    brelse(eb_bh);
2841    return ret;
2842}
2843
2844/*
2845 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2846 * num_clusters in data_tree "et" and change the refcount for the old
 * clusters(starting from p_cluster) in the refcount tree.
2848 *
2849 * Note:
2850 * 1. since we may split the old tree, so we at most will need num_clusters + 2
2851 * more new leaf records.
2852 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
2853 * just give data_ac = NULL.
2854 */
2855static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2856                    u32 p_cluster, u32 num_clusters,
2857                    struct ocfs2_extent_tree *et,
2858                    struct ocfs2_caching_info *ref_ci,
2859                    struct buffer_head *ref_root_bh,
2860                    struct ocfs2_alloc_context **meta_ac,
2861                    struct ocfs2_alloc_context **data_ac,
2862                    int *credits)
2863{
2864    int ret = 0, meta_add = 0;
2865    int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2866
2867    if (num_free_extents < 0) {
2868        ret = num_free_extents;
2869        mlog_errno(ret);
2870        goto out;
2871    }
2872
2873    if (num_free_extents < num_clusters + 2)
2874        meta_add =
2875            ocfs2_extend_meta_needed(et->et_root_el);
2876
2877    *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2878                          num_clusters + 2);
2879
2880    ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2881                           p_cluster, num_clusters,
2882                           &meta_add, credits);
2883    if (ret) {
2884        mlog_errno(ret);
2885        goto out;
2886    }
2887
2888    mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2889         meta_add, num_clusters, *credits);
2890    ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2891                        meta_ac);
2892    if (ret) {
2893        mlog_errno(ret);
2894        goto out;
2895    }
2896
2897    if (data_ac) {
2898        ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2899                         data_ac);
2900        if (ret)
2901            mlog_errno(ret);
2902    }
2903
2904out:
2905    if (ret) {
2906        if (*meta_ac) {
2907            ocfs2_free_alloc_context(*meta_ac);
2908            *meta_ac = NULL;
2909        }
2910    }
2911
2912    return ret;
2913}
2914
2915static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2916{
2917    BUG_ON(buffer_dirty(bh));
2918
2919    clear_buffer_mapped(bh);
2920
2921    return 0;
2922}
2923
2924static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2925                        struct ocfs2_cow_context *context,
2926                        u32 cpos, u32 old_cluster,
2927                        u32 new_cluster, u32 new_len)
2928{
2929    int ret = 0, partial;
2930    struct ocfs2_caching_info *ci = context->data_et.et_ci;
2931    struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2932    u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2933    struct page *page;
2934    pgoff_t page_index;
2935    unsigned int from, to;
2936    loff_t offset, end, map_end;
2937    struct address_space *mapping = context->inode->i_mapping;
2938
2939    mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2940         new_cluster, new_len, cpos);
2941
2942    offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2943    end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2944
2945    while (offset < end) {
2946        page_index = offset >> PAGE_CACHE_SHIFT;
2947        map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
2948        if (map_end > end)
2949            map_end = end;
2950
2951        /* from, to is the offset within the page. */
2952        from = offset & (PAGE_CACHE_SIZE - 1);
2953        to = PAGE_CACHE_SIZE;
2954        if (map_end & (PAGE_CACHE_SIZE - 1))
2955            to = map_end & (PAGE_CACHE_SIZE - 1);
2956
2957        page = grab_cache_page(mapping, page_index);
2958
2959        /*
2960         * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
2961         * can't be dirtied before we CoW it out.
2962         */
2963        if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
2964            BUG_ON(PageDirty(page));
2965
2966        if (!PageUptodate(page)) {
2967            ret = block_read_full_page(page, ocfs2_get_block);
2968            if (ret) {
2969                mlog_errno(ret);
2970                goto unlock;
2971            }
2972            lock_page(page);
2973        }
2974
2975        if (page_has_buffers(page)) {
2976            ret = walk_page_buffers(handle, page_buffers(page),
2977                        from, to, &partial,
2978                        ocfs2_clear_cow_buffer);
2979            if (ret) {
2980                mlog_errno(ret);
2981                goto unlock;
2982            }
2983        }
2984
2985        ocfs2_map_and_dirty_page(context->inode,
2986                     handle, from, to,
2987                     page, 0, &new_block);
2988        mark_page_accessed(page);
2989unlock:
2990        unlock_page(page);
2991        page_cache_release(page);
2992        page = NULL;
2993        offset = map_end;
2994        if (ret)
2995            break;
2996    }
2997
2998    return ret;
2999}
3000
3001static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3002                       struct ocfs2_cow_context *context,
3003                       u32 cpos, u32 old_cluster,
3004                       u32 new_cluster, u32 new_len)
3005{
3006    int ret = 0;
3007    struct super_block *sb = context->inode->i_sb;
3008    struct ocfs2_caching_info *ci = context->data_et.et_ci;
3009    int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
3010    u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
3011    u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
3012    struct ocfs2_super *osb = OCFS2_SB(sb);
3013    struct buffer_head *old_bh = NULL;
3014    struct buffer_head *new_bh = NULL;
3015
3016    mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
3017         new_cluster, new_len);
3018
3019    for (i = 0; i < blocks; i++, old_block++, new_block++) {
3020        new_bh = sb_getblk(osb->sb, new_block);
3021        if (new_bh == NULL) {
3022            ret = -EIO;
3023            mlog_errno(ret);
3024            break;
3025        }
3026
3027        ocfs2_set_new_buffer_uptodate(ci, new_bh);
3028
3029        ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
3030        if (ret) {
3031            mlog_errno(ret);
3032            break;
3033        }
3034
3035        ret = ocfs2_journal_access(handle, ci, new_bh,
3036                       OCFS2_JOURNAL_ACCESS_CREATE);
3037        if (ret) {
3038            mlog_errno(ret);
3039            break;
3040        }
3041
3042        memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
3043        ret = ocfs2_journal_dirty(handle, new_bh);
3044        if (ret) {
3045            mlog_errno(ret);
3046            break;
3047        }
3048
3049        brelse(new_bh);
3050        brelse(old_bh);
3051        new_bh = NULL;
3052        old_bh = NULL;
3053    }
3054
3055    brelse(new_bh);
3056    brelse(old_bh);
3057    return ret;
3058}
3059
3060static int ocfs2_clear_ext_refcount(handle_t *handle,
3061                    struct ocfs2_extent_tree *et,
3062                    u32 cpos, u32 p_cluster, u32 len,
3063                    unsigned int ext_flags,
3064                    struct ocfs2_alloc_context *meta_ac,
3065                    struct ocfs2_cached_dealloc_ctxt *dealloc)
3066{
3067    int ret, index;
3068    struct ocfs2_extent_rec replace_rec;
3069    struct ocfs2_path *path = NULL;
3070    struct ocfs2_extent_list *el;
3071    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3072    u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3073
3074    mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
3075         (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
3076
3077    memset(&replace_rec, 0, sizeof(replace_rec));
3078    replace_rec.e_cpos = cpu_to_le32(cpos);
3079    replace_rec.e_leaf_clusters = cpu_to_le16(len);
3080    replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
3081                                   p_cluster));
3082    replace_rec.e_flags = ext_flags;
3083    replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
3084
3085    path = ocfs2_new_path_from_et(et);
3086    if (!path) {
3087        ret = -ENOMEM;
3088        mlog_errno(ret);
3089        goto out;
3090    }
3091
3092    ret = ocfs2_find_path(et->et_ci, path, cpos);
3093    if (ret) {
3094        mlog_errno(ret);
3095        goto out;
3096    }
3097
3098    el = path_leaf_el(path);
3099
3100    index = ocfs2_search_extent_list(el, cpos);
3101    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3102        ocfs2_error(sb,
3103                "Inode %llu has an extent at cpos %u which can no "
3104                "longer be found.\n",
3105                (unsigned long long)ino, cpos);
3106        ret = -EROFS;
3107        goto out;
3108    }
3109
3110    ret = ocfs2_split_extent(handle, et, path, index,
3111                 &replace_rec, meta_ac, dealloc);
3112    if (ret)
3113        mlog_errno(ret);
3114
3115out:
3116    ocfs2_free_path(path);
3117    return ret;
3118}
3119
3120static int ocfs2_replace_clusters(handle_t *handle,
3121                  struct ocfs2_cow_context *context,
3122                  u32 cpos, u32 old,
3123                  u32 new, u32 len,
3124                  unsigned int ext_flags)
3125{
3126    int ret;
3127    struct ocfs2_caching_info *ci = context->data_et.et_ci;
3128    u64 ino = ocfs2_metadata_cache_owner(ci);
3129
3130    mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
3131         (unsigned long long)ino, cpos, old, new, len, ext_flags);
3132
3133    /*If the old clusters is unwritten, no need to duplicate. */
3134    if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3135        ret = context->cow_duplicate_clusters(handle, context, cpos,
3136                              old, new, len);
3137        if (ret) {
3138            mlog_errno(ret);
3139            goto out;
3140        }
3141    }
3142
3143    ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3144                       cpos, new, len, ext_flags,
3145                       context->meta_ac, &context->dealloc);
3146    if (ret)
3147        mlog_errno(ret);
3148out:
3149    return ret;
3150}
3151
3152static int ocfs2_cow_sync_writeback(struct super_block *sb,
3153                    struct ocfs2_cow_context *context,
3154                    u32 cpos, u32 num_clusters)
3155{
3156    int ret = 0;
3157    loff_t offset, end, map_end;
3158    pgoff_t page_index;
3159    struct page *page;
3160
3161    if (ocfs2_should_order_data(context->inode))
3162        return 0;
3163
3164    offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3165    end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3166
3167    ret = filemap_fdatawrite_range(context->inode->i_mapping,
3168                       offset, end - 1);
3169    if (ret < 0) {
3170        mlog_errno(ret);
3171        return ret;
3172    }
3173
3174    while (offset < end) {
3175        page_index = offset >> PAGE_CACHE_SHIFT;
3176        map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
3177        if (map_end > end)
3178            map_end = end;
3179
3180        page = grab_cache_page(context->inode->i_mapping, page_index);
3181        BUG_ON(!page);
3182
3183        wait_on_page_writeback(page);
3184        if (PageError(page)) {
3185            ret = -EIO;
3186            mlog_errno(ret);
3187        } else
3188            mark_page_accessed(page);
3189
3190        unlock_page(page);
3191        page_cache_release(page);
3192        page = NULL;
3193        offset = map_end;
3194        if (ret)
3195            break;
3196    }
3197
3198    return ret;
3199}
3200
3201static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3202                 u32 v_cluster, u32 *p_cluster,
3203                 u32 *num_clusters,
3204                 unsigned int *extent_flags)
3205{
3206    return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3207                  num_clusters, extent_flags);
3208}
3209
3210static int ocfs2_make_clusters_writable(struct super_block *sb,
3211                    struct ocfs2_cow_context *context,
3212                    u32 cpos, u32 p_cluster,
3213                    u32 num_clusters, unsigned int e_flags)
3214{
3215    int ret, delete, index, credits = 0;
3216    u32 new_bit, new_len;
3217    unsigned int set_len;
3218    struct ocfs2_super *osb = OCFS2_SB(sb);
3219    handle_t *handle;
3220    struct buffer_head *ref_leaf_bh = NULL;
3221    struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3222    struct ocfs2_refcount_rec rec;
3223
3224    mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
3225         cpos, p_cluster, num_clusters, e_flags);
3226
3227    ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3228                         &context->data_et,
3229                         ref_ci,
3230                         context->ref_root_bh,
3231                         &context->meta_ac,
3232                         &context->data_ac, &credits);
3233    if (ret) {
3234        mlog_errno(ret);
3235        return ret;
3236    }
3237
3238    if (context->post_refcount)
3239        credits += context->post_refcount->credits;
3240
3241    credits += context->extra_credits;
3242    handle = ocfs2_start_trans(osb, credits);
3243    if (IS_ERR(handle)) {
3244        ret = PTR_ERR(handle);
3245        mlog_errno(ret);
3246        goto out;
3247    }
3248
3249    while (num_clusters) {
3250        ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3251                         p_cluster, num_clusters,
3252                         &rec, &index, &ref_leaf_bh);
3253        if (ret) {
3254            mlog_errno(ret);
3255            goto out_commit;
3256        }
3257
3258        BUG_ON(!rec.r_refcount);
3259        set_len = min((u64)p_cluster + num_clusters,
3260                  le64_to_cpu(rec.r_cpos) +
3261                  le32_to_cpu(rec.r_clusters)) - p_cluster;
3262
3263        /*
3264         * There are many different situation here.
3265         * 1. If refcount == 1, remove the flag and don't COW.
3266         * 2. If refcount > 1, allocate clusters.
3267         * Here we may not allocate r_len once at a time, so continue
3268         * until we reach num_clusters.
3269         */
3270        if (le32_to_cpu(rec.r_refcount) == 1) {
3271            delete = 0;
3272            ret = ocfs2_clear_ext_refcount(handle,
3273                               &context->data_et,
3274                               cpos, p_cluster,
3275                               set_len, e_flags,
3276                               context->meta_ac,
3277                               &context->dealloc);
3278            if (ret) {
3279                mlog_errno(ret);
3280                goto out_commit;
3281            }
3282        } else {
3283            delete = 1;
3284
3285            ret = __ocfs2_claim_clusters(osb, handle,
3286                             context->data_ac,
3287                             1, set_len,
3288                             &new_bit, &new_len);
3289            if (ret) {
3290                mlog_errno(ret);
3291                goto out_commit;
3292            }
3293
3294            ret = ocfs2_replace_clusters(handle, context,
3295                             cpos, p_cluster, new_bit,
3296                             new_len, e_flags);
3297            if (ret) {
3298                mlog_errno(ret);
3299                goto out_commit;
3300            }
3301            set_len = new_len;
3302        }
3303
3304        ret = __ocfs2_decrease_refcount(handle, ref_ci,
3305                        context->ref_root_bh,
3306                        p_cluster, set_len,
3307                        context->meta_ac,
3308                        &context->dealloc, delete);
3309        if (ret) {
3310            mlog_errno(ret);
3311            goto out_commit;
3312        }
3313
3314        cpos += set_len;
3315        p_cluster += set_len;
3316        num_clusters -= set_len;
3317        brelse(ref_leaf_bh);
3318        ref_leaf_bh = NULL;
3319    }
3320
3321    /* handle any post_cow action. */
3322    if (context->post_refcount && context->post_refcount->func) {
3323        ret = context->post_refcount->func(context->inode, handle,
3324                        context->post_refcount->para);
3325        if (ret) {
3326            mlog_errno(ret);
3327            goto out_commit;
3328        }
3329    }
3330
3331    /*
3332     * Here we should write the new page out first if we are
3333     * in write-back mode.
3334     */
3335    if (context->get_clusters == ocfs2_di_get_clusters) {
3336        ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
3337        if (ret)
3338            mlog_errno(ret);
3339    }
3340
3341out_commit:
3342    ocfs2_commit_trans(osb, handle);
3343
3344out:
3345    if (context->data_ac) {
3346        ocfs2_free_alloc_context(context->data_ac);
3347        context->data_ac = NULL;
3348    }
3349    if (context->meta_ac) {
3350        ocfs2_free_alloc_context(context->meta_ac);
3351        context->meta_ac = NULL;
3352    }
3353    brelse(ref_leaf_bh);
3354
3355    return ret;
3356}
3357
3358static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3359{
3360    int ret = 0;
3361    struct inode *inode = context->inode;
3362    u32 cow_start = context->cow_start, cow_len = context->cow_len;
3363    u32 p_cluster, num_clusters;
3364    unsigned int ext_flags;
3365    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3366
3367    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3368        ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3369                "tree, but the feature bit is not set in the "
3370                "super block.", inode->i_ino);
3371        return -EROFS;
3372    }
3373
3374    ocfs2_init_dealloc_ctxt(&context->dealloc);
3375
3376    while (cow_len) {
3377        ret = context->get_clusters(context, cow_start, &p_cluster,
3378                        &num_clusters, &ext_flags);
3379        if (ret) {
3380            mlog_errno(ret);
3381            break;
3382        }
3383
3384        BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3385
3386        if (cow_len < num_clusters)
3387            num_clusters = cow_len;
3388
3389        ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3390                           cow_start, p_cluster,
3391                           num_clusters, ext_flags);
3392        if (ret) {
3393            mlog_errno(ret);
3394            break;
3395        }
3396
3397        cow_len -= num_clusters;
3398        cow_start += num_clusters;
3399    }
3400
3401    if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3402        ocfs2_schedule_truncate_log_flush(osb, 1);
3403        ocfs2_run_deallocs(osb, &context->dealloc);
3404    }
3405
3406    return ret;
3407}
3408
3409/*
3410 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3411 * past max_cpos. This will stop when it runs into a hole or an
3412 * unrefcounted extent.
3413 */
3414static int ocfs2_refcount_cow_hunk(struct inode *inode,
3415                   struct buffer_head *di_bh,
3416                   u32 cpos, u32 write_len, u32 max_cpos)
3417{
3418    int ret;
3419    u32 cow_start = 0, cow_len = 0;
3420    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3421    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3422    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3423    struct buffer_head *ref_root_bh = NULL;
3424    struct ocfs2_refcount_tree *ref_tree;
3425    struct ocfs2_cow_context *context = NULL;
3426
3427    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3428
3429    ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3430                          cpos, write_len, max_cpos,
3431                          &cow_start, &cow_len);
3432    if (ret) {
3433        mlog_errno(ret);
3434        goto out;
3435    }
3436
3437    mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3438         "cow_len %u\n", inode->i_ino,
3439         cpos, write_len, cow_start, cow_len);
3440
3441    BUG_ON(cow_len == 0);
3442
3443    context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3444    if (!context) {
3445        ret = -ENOMEM;
3446        mlog_errno(ret);
3447        goto out;
3448    }
3449
3450    ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3451                       1, &ref_tree, &ref_root_bh);
3452    if (ret) {
3453        mlog_errno(ret);
3454        goto out;
3455    }
3456
3457    context->inode = inode;
3458    context->cow_start = cow_start;
3459    context->cow_len = cow_len;
3460    context->ref_tree = ref_tree;
3461    context->ref_root_bh = ref_root_bh;
3462    context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3463    context->get_clusters = ocfs2_di_get_clusters;
3464
3465    ocfs2_init_dinode_extent_tree(&context->data_et,
3466                      INODE_CACHE(inode), di_bh);
3467
3468    ret = ocfs2_replace_cow(context);
3469    if (ret)
3470        mlog_errno(ret);
3471
3472    /*
3473     * truncate the extent map here since no matter whether we meet with
3474     * any error during the action, we shouldn't trust cached extent map
3475     * any more.
3476     */
3477    ocfs2_extent_map_trunc(inode, cow_start);
3478
3479    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3480    brelse(ref_root_bh);
3481out:
3482    kfree(context);
3483    return ret;
3484}
3485
3486/*
3487 * CoW any and all clusters between cpos and cpos+write_len.
3488 * Don't CoW past max_cpos. If this returns successfully, all
3489 * clusters between cpos and cpos+write_len are safe to modify.
3490 */
3491int ocfs2_refcount_cow(struct inode *inode,
3492               struct buffer_head *di_bh,
3493               u32 cpos, u32 write_len, u32 max_cpos)
3494{
3495    int ret = 0;
3496    u32 p_cluster, num_clusters;
3497    unsigned int ext_flags;
3498
3499    while (write_len) {
3500        ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3501                     &num_clusters, &ext_flags);
3502        if (ret) {
3503            mlog_errno(ret);
3504            break;
3505        }
3506
3507        if (write_len < num_clusters)
3508            num_clusters = write_len;
3509
3510        if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3511            ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3512                              num_clusters, max_cpos);
3513            if (ret) {
3514                mlog_errno(ret);
3515                break;
3516            }
3517        }
3518
3519        write_len -= num_clusters;
3520        cpos += num_clusters;
3521    }
3522
3523    return ret;
3524}
3525
3526static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3527                      u32 v_cluster, u32 *p_cluster,
3528                      u32 *num_clusters,
3529                      unsigned int *extent_flags)
3530{
3531    struct inode *inode = context->inode;
3532    struct ocfs2_xattr_value_root *xv = context->cow_object;
3533
3534    return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3535                    num_clusters, &xv->xr_list,
3536                    extent_flags);
3537}
3538
3539/*
3540 * Given a xattr value root, calculate the most meta/credits we need for
3541 * refcount tree change if we truncate it to 0.
3542 */
3543int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3544                       struct ocfs2_caching_info *ref_ci,
3545                       struct buffer_head *ref_root_bh,
3546                       struct ocfs2_xattr_value_root *xv,
3547                       int *meta_add, int *credits)
3548{
3549    int ret = 0, index, ref_blocks = 0;
3550    u32 p_cluster, num_clusters;
3551    u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3552    struct ocfs2_refcount_block *rb;
3553    struct ocfs2_refcount_rec rec;
3554    struct buffer_head *ref_leaf_bh = NULL;
3555
3556    while (cpos < clusters) {
3557        ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3558                           &num_clusters, &xv->xr_list,
3559                           NULL);
3560        if (ret) {
3561            mlog_errno(ret);
3562            goto out;
3563        }
3564
3565        cpos += num_clusters;
3566
3567        while (num_clusters) {
3568            ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3569                             p_cluster, num_clusters,
3570                             &rec, &index,
3571                             &ref_leaf_bh);
3572            if (ret) {
3573                mlog_errno(ret);
3574                goto out;
3575            }
3576
3577            BUG_ON(!rec.r_refcount);
3578
3579            rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3580
3581            /*
3582             * We really don't know whether the other clusters is in
3583             * this refcount block or not, so just take the worst
3584             * case that all the clusters are in this block and each
3585             * one will split a refcount rec, so totally we need
3586             * clusters * 2 new refcount rec.
3587             */
3588            if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3589                le16_to_cpu(rb->rf_records.rl_count))
3590                ref_blocks++;
3591
3592            *credits += 1;
3593            brelse(ref_leaf_bh);
3594            ref_leaf_bh = NULL;
3595
3596            if (num_clusters <= le32_to_cpu(rec.r_clusters))
3597                break;
3598            else
3599                num_clusters -= le32_to_cpu(rec.r_clusters);
3600            p_cluster += num_clusters;
3601        }
3602    }
3603
3604    *meta_add += ref_blocks;
3605    if (!ref_blocks)
3606        goto out;
3607
3608    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3609    if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3610        *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3611    else {
3612        struct ocfs2_extent_tree et;
3613
3614        ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3615        *credits += ocfs2_calc_extend_credits(inode->i_sb,
3616                              et.et_root_el,
3617                              ref_blocks);
3618    }
3619
3620out:
3621    brelse(ref_leaf_bh);
3622    return ret;
3623}
3624
3625/*
3626 * Do CoW for xattr.
3627 */
3628int ocfs2_refcount_cow_xattr(struct inode *inode,
3629                 struct ocfs2_dinode *di,
3630                 struct ocfs2_xattr_value_buf *vb,
3631                 struct ocfs2_refcount_tree *ref_tree,
3632                 struct buffer_head *ref_root_bh,
3633                 u32 cpos, u32 write_len,
3634                 struct ocfs2_post_refcount *post)
3635{
3636    int ret;
3637    struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3638    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3639    struct ocfs2_cow_context *context = NULL;
3640    u32 cow_start, cow_len;
3641
3642    BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3643
3644    ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3645                          cpos, write_len, UINT_MAX,
3646                          &cow_start, &cow_len);
3647    if (ret) {
3648        mlog_errno(ret);
3649        goto out;
3650    }
3651
3652    BUG_ON(cow_len == 0);
3653
3654    context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3655    if (!context) {
3656        ret = -ENOMEM;
3657        mlog_errno(ret);
3658        goto out;
3659    }
3660
3661    context->inode = inode;
3662    context->cow_start = cow_start;
3663    context->cow_len = cow_len;
3664    context->ref_tree = ref_tree;
3665    context->ref_root_bh = ref_root_bh;;
3666    context->cow_object = xv;
3667
3668    context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3669    /* We need the extra credits for duplicate_clusters by jbd. */
3670    context->extra_credits =
3671        ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3672    context->get_clusters = ocfs2_xattr_value_get_clusters;
3673    context->post_refcount = post;
3674
3675    ocfs2_init_xattr_value_extent_tree(&context->data_et,
3676                       INODE_CACHE(inode), vb);
3677
3678    ret = ocfs2_replace_cow(context);
3679    if (ret)
3680        mlog_errno(ret);
3681
3682out:
3683    kfree(context);
3684    return ret;
3685}
3686
3687/*
3688 * Insert a new extent into refcount tree and mark a extent rec
3689 * as refcounted in the dinode tree.
3690 */
3691int ocfs2_add_refcount_flag(struct inode *inode,
3692                struct ocfs2_extent_tree *data_et,
3693                struct ocfs2_caching_info *ref_ci,
3694                struct buffer_head *ref_root_bh,
3695                u32 cpos, u32 p_cluster, u32 num_clusters,
3696                struct ocfs2_cached_dealloc_ctxt *dealloc,
3697                struct ocfs2_post_refcount *post)
3698{
3699    int ret;
3700    handle_t *handle;
3701    int credits = 1, ref_blocks = 0;
3702    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3703    struct ocfs2_alloc_context *meta_ac = NULL;
3704
3705    ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3706                           ref_ci, ref_root_bh,
3707                           p_cluster, num_clusters,
3708                           &ref_blocks, &credits);
3709    if (ret) {
3710        mlog_errno(ret);
3711        goto out;
3712    }
3713
3714    mlog(0, "reserve new metadata %d, credits = %d\n",
3715         ref_blocks, credits);
3716
3717    if (ref_blocks) {
3718        ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3719                            ref_blocks, &meta_ac);
3720        if (ret) {
3721            mlog_errno(ret);
3722            goto out;
3723        }
3724    }
3725
3726    if (post)
3727        credits += post->credits;
3728
3729    handle = ocfs2_start_trans(osb, credits);
3730    if (IS_ERR(handle)) {
3731        ret = PTR_ERR(handle);
3732        mlog_errno(ret);
3733        goto out;
3734    }
3735
3736    ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3737                       cpos, num_clusters, p_cluster,
3738                       meta_ac, dealloc);
3739    if (ret) {
3740        mlog_errno(ret);
3741        goto out_commit;
3742    }
3743
3744    ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3745                    p_cluster, num_clusters, 0,
3746                    meta_ac, dealloc);
3747    if (ret) {
3748        mlog_errno(ret);
3749        goto out_commit;
3750    }
3751
3752    if (post && post->func) {
3753        ret = post->func(inode, handle, post->para);
3754        if (ret)
3755            mlog_errno(ret);
3756    }
3757
3758out_commit:
3759    ocfs2_commit_trans(osb, handle);
3760out:
3761    if (meta_ac)
3762        ocfs2_free_alloc_context(meta_ac);
3763    return ret;
3764}
3765
3766static int ocfs2_change_ctime(struct inode *inode,
3767                  struct buffer_head *di_bh)
3768{
3769    int ret;
3770    handle_t *handle;
3771    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3772
3773    handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3774                   OCFS2_INODE_UPDATE_CREDITS);
3775    if (IS_ERR(handle)) {
3776        ret = PTR_ERR(handle);
3777        mlog_errno(ret);
3778        goto out;
3779    }
3780
3781    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3782                      OCFS2_JOURNAL_ACCESS_WRITE);
3783    if (ret) {
3784        mlog_errno(ret);
3785        goto out_commit;
3786    }
3787
3788    inode->i_ctime = CURRENT_TIME;
3789    di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3790    di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3791
3792    ocfs2_journal_dirty(handle, di_bh);
3793
3794out_commit:
3795    ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3796out:
3797    return ret;
3798}
3799
3800static int ocfs2_attach_refcount_tree(struct inode *inode,
3801                      struct buffer_head *di_bh)
3802{
3803    int ret, data_changed = 0;
3804    struct buffer_head *ref_root_bh = NULL;
3805    struct ocfs2_inode_info *oi = OCFS2_I(inode);
3806    struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3807    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3808    struct ocfs2_refcount_tree *ref_tree;
3809    unsigned int ext_flags;
3810    loff_t size;
3811    u32 cpos, num_clusters, clusters, p_cluster;
3812    struct ocfs2_cached_dealloc_ctxt dealloc;
3813    struct ocfs2_extent_tree di_et;
3814
3815    ocfs2_init_dealloc_ctxt(&dealloc);
3816
3817    if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3818        ret = ocfs2_create_refcount_tree(inode, di_bh);
3819        if (ret) {
3820            mlog_errno(ret);
3821            goto out;
3822        }
3823    }
3824
3825    BUG_ON(!di->i_refcount_loc);
3826    ret = ocfs2_lock_refcount_tree(osb,
3827                       le64_to_cpu(di->i_refcount_loc), 1,
3828                       &ref_tree, &ref_root_bh);
3829    if (ret) {
3830        mlog_errno(ret);
3831        goto out;
3832    }
3833
3834    if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3835        goto attach_xattr;
3836
3837    ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3838
3839    size = i_size_read(inode);
3840    clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3841
3842    cpos = 0;
3843    while (cpos < clusters) {
3844        ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3845                     &num_clusters, &ext_flags);
3846
3847        if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3848            ret = ocfs2_add_refcount_flag(inode, &di_et,
3849                              &ref_tree->rf_ci,
3850                              ref_root_bh, cpos,
3851                              p_cluster, num_clusters,
3852                              &dealloc, NULL);
3853            if (ret) {
3854                mlog_errno(ret);
3855                goto unlock;
3856            }
3857
3858            data_changed = 1;
3859        }
3860        cpos += num_clusters;
3861    }
3862
3863attach_xattr:
3864    if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3865        ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3866                               &ref_tree->rf_ci,
3867                               ref_root_bh,
3868                               &dealloc);
3869        if (ret) {
3870            mlog_errno(ret);
3871            goto unlock;
3872        }
3873    }
3874
3875    if (data_changed) {
3876        ret = ocfs2_change_ctime(inode, di_bh);
3877        if (ret)
3878            mlog_errno(ret);
3879    }
3880
3881unlock:
3882    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3883    brelse(ref_root_bh);
3884
3885    if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3886        ocfs2_schedule_truncate_log_flush(osb, 1);
3887        ocfs2_run_deallocs(osb, &dealloc);
3888    }
3889out:
3890    /*
3891     * Empty the extent map so that we may get the right extent
3892     * record from the disk.
3893     */
3894    ocfs2_extent_map_trunc(inode, 0);
3895
3896    return ret;
3897}
3898
3899static int ocfs2_add_refcounted_extent(struct inode *inode,
3900                   struct ocfs2_extent_tree *et,
3901                   struct ocfs2_caching_info *ref_ci,
3902                   struct buffer_head *ref_root_bh,
3903                   u32 cpos, u32 p_cluster, u32 num_clusters,
3904                   unsigned int ext_flags,
3905                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3906{
3907    int ret;
3908    handle_t *handle;
3909    int credits = 0;
3910    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3911    struct ocfs2_alloc_context *meta_ac = NULL;
3912
3913    ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3914                         p_cluster, num_clusters,
3915                         et, ref_ci,
3916                         ref_root_bh, &meta_ac,
3917                         NULL, &credits);
3918    if (ret) {
3919        mlog_errno(ret);
3920        goto out;
3921    }
3922
3923    handle = ocfs2_start_trans(osb, credits);
3924    if (IS_ERR(handle)) {
3925        ret = PTR_ERR(handle);
3926        mlog_errno(ret);
3927        goto out;
3928    }
3929
3930    ret = ocfs2_insert_extent(handle, et, cpos,
3931            ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
3932            num_clusters, ext_flags, meta_ac);
3933    if (ret) {
3934        mlog_errno(ret);
3935        goto out_commit;
3936    }
3937
3938    ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3939                      p_cluster, num_clusters,
3940                      meta_ac, dealloc);
3941    if (ret)
3942        mlog_errno(ret);
3943
3944out_commit:
3945    ocfs2_commit_trans(osb, handle);
3946out:
3947    if (meta_ac)
3948        ocfs2_free_alloc_context(meta_ac);
3949    return ret;
3950}
3951
3952static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3953                       struct buffer_head *s_bh,
3954                       struct inode *t_inode,
3955                       struct buffer_head *t_bh)
3956{
3957    int ret;
3958    handle_t *handle;
3959    struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3960    struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3961    struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
3962
3963    BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
3964
3965    handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
3966    if (IS_ERR(handle)) {
3967        ret = PTR_ERR(handle);
3968        mlog_errno(ret);
3969        goto out;
3970    }
3971
3972    ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3973                      OCFS2_JOURNAL_ACCESS_WRITE);
3974    if (ret) {
3975        mlog_errno(ret);
3976        goto out_commit;
3977    }
3978
3979    t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
3980    memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
3981           le16_to_cpu(s_di->id2.i_data.id_count));
3982    spin_lock(&OCFS2_I(t_inode)->ip_lock);
3983    OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
3984    t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
3985    spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3986
3987    ocfs2_journal_dirty(handle, t_bh);
3988
3989out_commit:
3990    ocfs2_commit_trans(osb, handle);
3991out:
3992    return ret;
3993}
3994
3995static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3996                struct inode *t_inode,
3997                struct buffer_head *t_bh,
3998                struct ocfs2_caching_info *ref_ci,
3999                struct buffer_head *ref_root_bh,
4000                struct ocfs2_cached_dealloc_ctxt *dealloc)
4001{
4002    int ret = 0;
4003    u32 p_cluster, num_clusters, clusters, cpos;
4004    loff_t size;
4005    unsigned int ext_flags;
4006    struct ocfs2_extent_tree et;
4007
4008    ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
4009
4010    size = i_size_read(s_inode);
4011    clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
4012
4013    cpos = 0;
4014    while (cpos < clusters) {
4015        ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
4016                     &num_clusters, &ext_flags);
4017
4018        if (p_cluster) {
4019            ret = ocfs2_add_refcounted_extent(t_inode, &et,
4020                              ref_ci, ref_root_bh,
4021                              cpos, p_cluster,
4022                              num_clusters,
4023                              ext_flags,
4024                              dealloc);
4025            if (ret) {
4026                mlog_errno(ret);
4027                goto out;
4028            }
4029        }
4030
4031        cpos += num_clusters;
4032    }
4033
4034out:
4035    return ret;
4036}
4037
4038/*
4039 * change the new file's attributes to the src.
4040 *
4041 * reflink creates a snapshot of a file, that means the attributes
4042 * must be identical except for three exceptions - nlink, ino, and ctime.
4043 */
4044static int ocfs2_complete_reflink(struct inode *s_inode,
4045                  struct buffer_head *s_bh,
4046                  struct inode *t_inode,
4047                  struct buffer_head *t_bh,
4048                  bool preserve)
4049{
4050    int ret;
4051    handle_t *handle;
4052    struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
4053    struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
4054    loff_t size = i_size_read(s_inode);
4055
4056    handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
4057                   OCFS2_INODE_UPDATE_CREDITS);
4058    if (IS_ERR(handle)) {
4059        ret = PTR_ERR(handle);
4060        mlog_errno(ret);
4061        return ret;
4062    }
4063
4064    ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
4065                      OCFS2_JOURNAL_ACCESS_WRITE);
4066    if (ret) {
4067        mlog_errno(ret);
4068        goto out_commit;
4069    }
4070
4071    spin_lock(&OCFS2_I(t_inode)->ip_lock);
4072    OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
4073    OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
4074    OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
4075    spin_unlock(&OCFS2_I(t_inode)->ip_lock);
4076    i_size_write(t_inode, size);
4077    t_inode->i_blocks = s_inode->i_blocks;
4078
4079    di->i_xattr_inline_size = s_di->i_xattr_inline_size;
4080    di->i_clusters = s_di->i_clusters;
4081    di->i_size = s_di->i_size;
4082    di->i_dyn_features = s_di->i_dyn_features;
4083    di->i_attr = s_di->i_attr;
4084
4085    if (preserve) {
4086        di->i_uid = s_di->i_uid;
4087        di->i_gid = s_di->i_gid;
4088        di->i_mode = s_di->i_mode;
4089
4090        /*
4091         * update time.
4092         * we want mtime to appear identical to the source and
4093         * update ctime.
4094         */
4095        t_inode->i_ctime = CURRENT_TIME;
4096
4097        di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
4098        di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
4099
4100        t_inode->i_mtime = s_inode->i_mtime;
4101        di->i_mtime = s_di->i_mtime;
4102        di->i_mtime_nsec = s_di->i_mtime_nsec;
4103    }
4104
4105    ocfs2_journal_dirty(handle, t_bh);
4106
4107out_commit:
4108    ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
4109    return ret;
4110}
4111
4112static int ocfs2_create_reflink_node(struct inode *s_inode,
4113                     struct buffer_head *s_bh,
4114                     struct inode *t_inode,
4115                     struct buffer_head *t_bh,
4116                     bool preserve)
4117{
4118    int ret;
4119    struct buffer_head *ref_root_bh = NULL;
4120    struct ocfs2_cached_dealloc_ctxt dealloc;
4121    struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
4122    struct ocfs2_refcount_block *rb;
4123    struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
4124    struct ocfs2_refcount_tree *ref_tree;
4125
4126    ocfs2_init_dealloc_ctxt(&dealloc);
4127
4128    ret = ocfs2_set_refcount_tree(t_inode, t_bh,
4129                      le64_to_cpu(di->i_refcount_loc));
4130    if (ret) {
4131        mlog_errno(ret);
4132        goto out;
4133    }
4134
4135    if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4136        ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4137                          t_inode, t_bh);
4138        if (ret)
4139            mlog_errno(ret);
4140        goto out;
4141    }
4142
4143    ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4144                       1, &ref_tree, &ref_root_bh);
4145    if (ret) {
4146        mlog_errno(ret);
4147        goto out;
4148    }
4149    rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4150
4151    ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4152                      &ref_tree->rf_ci, ref_root_bh,
4153                      &dealloc);
4154    if (ret) {
4155        mlog_errno(ret);
4156        goto out_unlock_refcount;
4157    }
4158
4159out_unlock_refcount:
4160    ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4161    brelse(ref_root_bh);
4162out:
4163    if (ocfs2_dealloc_has_cluster(&dealloc)) {
4164        ocfs2_schedule_truncate_log_flush(osb, 1);
4165        ocfs2_run_deallocs(osb, &dealloc);
4166    }
4167
4168    return ret;
4169}
4170
4171static int __ocfs2_reflink(struct dentry *old_dentry,
4172               struct buffer_head *old_bh,
4173               struct inode *new_inode,
4174               bool preserve)
4175{
4176    int ret;
4177    struct inode *inode = old_dentry->d_inode;
4178    struct buffer_head *new_bh = NULL;
4179
4180    ret = filemap_fdatawrite(inode->i_mapping);
4181    if (ret) {
4182        mlog_errno(ret);
4183        goto out;
4184    }
4185
4186    ret = ocfs2_attach_refcount_tree(inode, old_bh);
4187    if (ret) {
4188        mlog_errno(ret);
4189        goto out;
4190    }
4191
4192    mutex_lock(&new_inode->i_mutex);
4193    ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
4194    if (ret) {
4195        mlog_errno(ret);
4196        goto out_unlock;
4197    }
4198
4199    ret = ocfs2_create_reflink_node(inode, old_bh,
4200                    new_inode, new_bh, preserve);
4201    if (ret) {
4202        mlog_errno(ret);
4203        goto inode_unlock;
4204    }
4205
4206    if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4207        ret = ocfs2_reflink_xattrs(inode, old_bh,
4208                       new_inode, new_bh,
4209                       preserve);
4210        if (ret) {
4211            mlog_errno(ret);
4212            goto inode_unlock;
4213        }
4214    }
4215
4216    ret = ocfs2_complete_reflink(inode, old_bh,
4217                     new_inode, new_bh, preserve);
4218    if (ret)
4219        mlog_errno(ret);
4220
4221inode_unlock:
4222    ocfs2_inode_unlock(new_inode, 1);
4223    brelse(new_bh);
4224out_unlock:
4225    mutex_unlock(&new_inode->i_mutex);
4226out:
4227    if (!ret) {
4228        ret = filemap_fdatawait(inode->i_mapping);
4229        if (ret)
4230            mlog_errno(ret);
4231    }
4232    return ret;
4233}
4234
4235static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4236             struct dentry *new_dentry, bool preserve)
4237{
4238    int error;
4239    struct inode *inode = old_dentry->d_inode;
4240    struct buffer_head *old_bh = NULL;
4241    struct inode *new_orphan_inode = NULL;
4242
4243    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4244        return -EOPNOTSUPP;
4245
4246    error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4247                         &new_orphan_inode);
4248    if (error) {
4249        mlog_errno(error);
4250        goto out;
4251    }
4252
4253    error = ocfs2_inode_lock(inode, &old_bh, 1);
4254    if (error) {
4255        mlog_errno(error);
4256        goto out;
4257    }
4258
4259    down_write(&OCFS2_I(inode)->ip_xattr_sem);
4260    down_write(&OCFS2_I(inode)->ip_alloc_sem);
4261    error = __ocfs2_reflink(old_dentry, old_bh,
4262                new_orphan_inode, preserve);
4263    up_write(&OCFS2_I(inode)->ip_alloc_sem);
4264    up_write(&OCFS2_I(inode)->ip_xattr_sem);
4265
4266    ocfs2_inode_unlock(inode, 1);
4267    brelse(old_bh);
4268
4269    if (error) {
4270        mlog_errno(error);
4271        goto out;
4272    }
4273
4274    /* If the security isn't preserved, we need to re-initialize them. */
4275    if (!preserve) {
4276        error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
4277        if (error)
4278            mlog_errno(error);
4279    }
4280out:
4281    if (!error) {
4282        error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4283                               new_dentry);
4284        if (error)
4285            mlog_errno(error);
4286    }
4287
4288    if (new_orphan_inode) {
4289        /*
4290         * We need to open_unlock the inode no matter whether we
4291         * succeed or not, so that other nodes can delete it later.
4292         */
4293        ocfs2_open_unlock(new_orphan_inode);
4294        if (error)
4295            iput(new_orphan_inode);
4296    }
4297
4298    return error;
4299}
4300
4301/*
4302 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4303 * sys_reflink(). This will go away when vfs_reflink() exists in
4304 * fs/namei.c.
4305 */
4306
4307/* copied from may_create in VFS. */
4308static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4309{
4310    if (child->d_inode)
4311        return -EEXIST;
4312    if (IS_DEADDIR(dir))
4313        return -ENOENT;
4314    return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4315}
4316
4317/* copied from user_path_parent. */
4318static int ocfs2_user_path_parent(const char __user *path,
4319                  struct nameidata *nd, char **name)
4320{
4321    char *s = getname(path);
4322    int error;
4323
4324    if (IS_ERR(s))
4325        return PTR_ERR(s);
4326
4327    error = path_lookup(s, LOOKUP_PARENT, nd);
4328    if (error)
4329        putname(s);
4330    else
4331        *name = s;
4332
4333    return error;
4334}
4335
4336/**
4337 * ocfs2_vfs_reflink - Create a reference-counted link
4338 *
4339 * @old_dentry: source dentry + inode
4340 * @dir: directory to create the target
4341 * @new_dentry: target dentry
4342 * @preserve: if true, preserve all file attributes
4343 */
4344static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4345                 struct dentry *new_dentry, bool preserve)
4346{
4347    struct inode *inode = old_dentry->d_inode;
4348    int error;
4349
4350    if (!inode)
4351        return -ENOENT;
4352
4353    error = ocfs2_may_create(dir, new_dentry);
4354    if (error)
4355        return error;
4356
4357    if (dir->i_sb != inode->i_sb)
4358        return -EXDEV;
4359
4360    /*
4361     * A reflink to an append-only or immutable file cannot be created.
4362     */
4363    if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4364        return -EPERM;
4365
4366    /* Only regular files can be reflinked. */
4367    if (!S_ISREG(inode->i_mode))
4368        return -EPERM;
4369
4370    /*
4371     * If the caller wants to preserve ownership, they require the
4372     * rights to do so.
4373     */
4374    if (preserve) {
4375        if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4376            return -EPERM;
4377        if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4378            return -EPERM;
4379    }
4380
4381    /*
4382     * If the caller is modifying any aspect of the attributes, they
4383     * are not creating a snapshot. They need read permission on the
4384     * file.
4385     */
4386    if (!preserve) {
4387        error = inode_permission(inode, MAY_READ);
4388        if (error)
4389            return error;
4390    }
4391
4392    mutex_lock(&inode->i_mutex);
4393    dquot_initialize(dir);
4394    error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4395    mutex_unlock(&inode->i_mutex);
4396    if (!error)
4397        fsnotify_create(dir, new_dentry);
4398    return error;
4399}
4400/*
4401 * Most codes are copied from sys_linkat.
4402 */
4403int ocfs2_reflink_ioctl(struct inode *inode,
4404            const char __user *oldname,
4405            const char __user *newname,
4406            bool preserve)
4407{
4408    struct dentry *new_dentry;
4409    struct nameidata nd;
4410    struct path old_path;
4411    int error;
4412    char *to = NULL;
4413
4414    if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4415        return -EOPNOTSUPP;
4416
4417    error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4418    if (error) {
4419        mlog_errno(error);
4420        return error;
4421    }
4422
4423    error = ocfs2_user_path_parent(newname, &nd, &to);
4424    if (error) {
4425        mlog_errno(error);
4426        goto out;
4427    }
4428
4429    error = -EXDEV;
4430    if (old_path.mnt != nd.path.mnt)
4431        goto out_release;
4432    new_dentry = lookup_create(&nd, 0);
4433    error = PTR_ERR(new_dentry);
4434    if (IS_ERR(new_dentry)) {
4435        mlog_errno(error);
4436        goto out_unlock;
4437    }
4438
4439    error = mnt_want_write(nd.path.mnt);
4440    if (error) {
4441        mlog_errno(error);
4442        goto out_dput;
4443    }
4444
4445    error = ocfs2_vfs_reflink(old_path.dentry,
4446                  nd.path.dentry->d_inode,
4447                  new_dentry, preserve);
4448    mnt_drop_write(nd.path.mnt);
4449out_dput:
4450    dput(new_dentry);
4451out_unlock:
4452    mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4453out_release:
4454    path_put(&nd.path);
4455    putname(to);
4456out:
4457    path_put(&old_path);
4458
4459    return error;
4460}
4461

Archive Download this file



interactive