fs/btrfs/extent-tree.c

1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/sort.h>
23#include <linux/rcupdate.h>
24#include <linux/kthread.h>
25#include <linux/slab.h>
26#include "compat.h"
27#include "hash.h"
28#include "ctree.h"
29#include "disk-io.h"
30#include "print-tree.h"
31#include "transaction.h"
32#include "volumes.h"
33#include "locking.h"
34#include "free-space-cache.h"
35
36static int update_block_group(struct btrfs_trans_handle *trans,
37                  struct btrfs_root *root,
38                  u64 bytenr, u64 num_bytes, int alloc,
39                  int mark_free);
40static int update_reserved_extents(struct btrfs_block_group_cache *cache,
41                   u64 num_bytes, int reserve);
42static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
43                struct btrfs_root *root,
44                u64 bytenr, u64 num_bytes, u64 parent,
45                u64 root_objectid, u64 owner_objectid,
46                u64 owner_offset, int refs_to_drop,
47                struct btrfs_delayed_extent_op *extra_op);
48static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
49                    struct extent_buffer *leaf,
50                    struct btrfs_extent_item *ei);
51static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
52                      struct btrfs_root *root,
53                      u64 parent, u64 root_objectid,
54                      u64 flags, u64 owner, u64 offset,
55                      struct btrfs_key *ins, int ref_mod);
56static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
57                     struct btrfs_root *root,
58                     u64 parent, u64 root_objectid,
59                     u64 flags, struct btrfs_disk_key *key,
60                     int level, struct btrfs_key *ins);
61static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62              struct btrfs_root *extent_root, u64 alloc_bytes,
63              u64 flags, int force);
64static int pin_down_bytes(struct btrfs_trans_handle *trans,
65              struct btrfs_root *root,
66              struct btrfs_path *path,
67              u64 bytenr, u64 num_bytes,
68              int is_data, int reserved,
69              struct extent_buffer **must_clean);
70static int find_next_key(struct btrfs_path *path, int level,
71             struct btrfs_key *key);
72static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
73                int dump_block_groups);
74
75static noinline int
76block_group_cache_done(struct btrfs_block_group_cache *cache)
77{
78    smp_mb();
79    return cache->cached == BTRFS_CACHE_FINISHED;
80}
81
82static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
83{
84    return (cache->flags & bits) == bits;
85}
86
87void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
88{
89    atomic_inc(&cache->count);
90}
91
92void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
93{
94    if (atomic_dec_and_test(&cache->count))
95        kfree(cache);
96}
97
98/*
99 * this adds the block group to the fs_info rb tree for the block group
100 * cache
101 */
102static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
103                struct btrfs_block_group_cache *block_group)
104{
105    struct rb_node **p;
106    struct rb_node *parent = NULL;
107    struct btrfs_block_group_cache *cache;
108
109    spin_lock(&info->block_group_cache_lock);
110    p = &info->block_group_cache_tree.rb_node;
111
112    while (*p) {
113        parent = *p;
114        cache = rb_entry(parent, struct btrfs_block_group_cache,
115                 cache_node);
116        if (block_group->key.objectid < cache->key.objectid) {
117            p = &(*p)->rb_left;
118        } else if (block_group->key.objectid > cache->key.objectid) {
119            p = &(*p)->rb_right;
120        } else {
121            spin_unlock(&info->block_group_cache_lock);
122            return -EEXIST;
123        }
124    }
125
126    rb_link_node(&block_group->cache_node, parent, p);
127    rb_insert_color(&block_group->cache_node,
128            &info->block_group_cache_tree);
129    spin_unlock(&info->block_group_cache_lock);
130
131    return 0;
132}
133
134/*
135 * This will return the block group at or after bytenr if contains is 0, else
136 * it will return the block group that contains the bytenr
137 */
138static struct btrfs_block_group_cache *
139block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
140                  int contains)
141{
142    struct btrfs_block_group_cache *cache, *ret = NULL;
143    struct rb_node *n;
144    u64 end, start;
145
146    spin_lock(&info->block_group_cache_lock);
147    n = info->block_group_cache_tree.rb_node;
148
149    while (n) {
150        cache = rb_entry(n, struct btrfs_block_group_cache,
151                 cache_node);
152        end = cache->key.objectid + cache->key.offset - 1;
153        start = cache->key.objectid;
154
155        if (bytenr < start) {
156            if (!contains && (!ret || start < ret->key.objectid))
157                ret = cache;
158            n = n->rb_left;
159        } else if (bytenr > start) {
160            if (contains && bytenr <= end) {
161                ret = cache;
162                break;
163            }
164            n = n->rb_right;
165        } else {
166            ret = cache;
167            break;
168        }
169    }
170    if (ret)
171        btrfs_get_block_group(ret);
172    spin_unlock(&info->block_group_cache_lock);
173
174    return ret;
175}
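
/*
 * Usage sketch (editorial note, not part of the original source): the
 * helper above takes a reference on the group it returns, so every
 * successful lookup must be paired with btrfs_put_block_group().  For a
 * hypothetical caller that only needs the range covered by the group
 * containing @bytenr:
 *
 *    struct btrfs_block_group_cache *bg;
 *
 *    bg = block_group_cache_tree_search(info, bytenr, 1);
 *    if (!bg)
 *        return -ENOENT;
 *    start = bg->key.objectid;
 *    len = bg->key.offset;
 *    btrfs_put_block_group(bg);
 */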
176
177static int add_excluded_extent(struct btrfs_root *root,
178                   u64 start, u64 num_bytes)
179{
180    u64 end = start + num_bytes - 1;
181    set_extent_bits(&root->fs_info->freed_extents[0],
182            start, end, EXTENT_UPTODATE, GFP_NOFS);
183    set_extent_bits(&root->fs_info->freed_extents[1],
184            start, end, EXTENT_UPTODATE, GFP_NOFS);
185    return 0;
186}
187
188static void free_excluded_extents(struct btrfs_root *root,
189                  struct btrfs_block_group_cache *cache)
190{
191    u64 start, end;
192
193    start = cache->key.objectid;
194    end = start + cache->key.offset - 1;
195
196    clear_extent_bits(&root->fs_info->freed_extents[0],
197              start, end, EXTENT_UPTODATE, GFP_NOFS);
198    clear_extent_bits(&root->fs_info->freed_extents[1],
199              start, end, EXTENT_UPTODATE, GFP_NOFS);
200}
201
202static int exclude_super_stripes(struct btrfs_root *root,
203                 struct btrfs_block_group_cache *cache)
204{
205    u64 bytenr;
206    u64 *logical;
207    int stripe_len;
208    int i, nr, ret;
209
210    if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
211        stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
212        cache->bytes_super += stripe_len;
213        ret = add_excluded_extent(root, cache->key.objectid,
214                      stripe_len);
215        BUG_ON(ret);
216    }
217
218    for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
219        bytenr = btrfs_sb_offset(i);
220        ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
221                       cache->key.objectid, bytenr,
222                       0, &logical, &nr, &stripe_len);
223        BUG_ON(ret);
224
225        while (nr--) {
226            cache->bytes_super += stripe_len;
227            ret = add_excluded_extent(root, logical[nr],
228                          stripe_len);
229            BUG_ON(ret);
230        }
231
232        kfree(logical);
233    }
234    return 0;
235}
236
237static struct btrfs_caching_control *
238get_caching_control(struct btrfs_block_group_cache *cache)
239{
240    struct btrfs_caching_control *ctl;
241
242    spin_lock(&cache->lock);
243    if (cache->cached != BTRFS_CACHE_STARTED) {
244        spin_unlock(&cache->lock);
245        return NULL;
246    }
247
248    ctl = cache->caching_ctl;
249    atomic_inc(&ctl->count);
250    spin_unlock(&cache->lock);
251    return ctl;
252}
253
254static void put_caching_control(struct btrfs_caching_control *ctl)
255{
256    if (atomic_dec_and_test(&ctl->count))
257        kfree(ctl);
258}
259
260/*
261 * this is only called by cache_block_group. Since we could have freed extents,
262 * we need to check the pinned_extents for any extents that can't be used yet,
263 * as their free space will not be released until the transaction commits.
264 */
265static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
266                  struct btrfs_fs_info *info, u64 start, u64 end)
267{
268    u64 extent_start, extent_end, size, total_added = 0;
269    int ret;
270
271    while (start < end) {
272        ret = find_first_extent_bit(info->pinned_extents, start,
273                        &extent_start, &extent_end,
274                        EXTENT_DIRTY | EXTENT_UPTODATE);
275        if (ret)
276            break;
277
278        if (extent_start <= start) {
279            start = extent_end + 1;
280        } else if (extent_start > start && extent_start < end) {
281            size = extent_start - start;
282            total_added += size;
283            ret = btrfs_add_free_space(block_group, start,
284                           size);
285            BUG_ON(ret);
286            start = extent_end + 1;
287        } else {
288            break;
289        }
290    }
291
292    if (start < end) {
293        size = end - start;
294        total_added += size;
295        ret = btrfs_add_free_space(block_group, start, size);
296        BUG_ON(ret);
297    }
298
299    return total_added;
300}
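
/*
 * Worked example (editorial): suppose the caller passes [0, 100) and
 * pinned_extents holds one pinned range [40, 59].  The loop above adds
 * [0, 40) as free space and skips to 60, the trailing branch then adds
 * [60, 100), so total_added ends up as 40 + 40 = 80 bytes while the 20
 * pinned bytes stay excluded until the transaction commits.
 */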
301
302static int caching_kthread(void *data)
303{
304    struct btrfs_block_group_cache *block_group = data;
305    struct btrfs_fs_info *fs_info = block_group->fs_info;
306    struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
307    struct btrfs_root *extent_root = fs_info->extent_root;
308    struct btrfs_path *path;
309    struct extent_buffer *leaf;
310    struct btrfs_key key;
311    u64 total_found = 0;
312    u64 last = 0;
313    u32 nritems;
314    int ret = 0;
315
316    path = btrfs_alloc_path();
317    if (!path)
318        return -ENOMEM;
319
320    exclude_super_stripes(extent_root, block_group);
321    spin_lock(&block_group->space_info->lock);
322    block_group->space_info->bytes_super += block_group->bytes_super;
323    spin_unlock(&block_group->space_info->lock);
324
325    last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
326
327    /*
328     * We don't want to deadlock with somebody trying to allocate a new
329     * extent for the extent root while also trying to search the extent
330     * root to add free space. So we skip locking and search the commit
331     * root, since it's read-only.
332     */
333    path->skip_locking = 1;
334    path->search_commit_root = 1;
335    path->reada = 2;
336
337    key.objectid = last;
338    key.offset = 0;
339    key.type = BTRFS_EXTENT_ITEM_KEY;
340again:
341    mutex_lock(&caching_ctl->mutex);
342    /* need to make sure the commit_root doesn't disappear */
343    down_read(&fs_info->extent_commit_sem);
344
345    ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
346    if (ret < 0)
347        goto err;
348
349    leaf = path->nodes[0];
350    nritems = btrfs_header_nritems(leaf);
351
352    while (1) {
353        smp_mb();
354        if (fs_info->closing > 1) {
355            last = (u64)-1;
356            break;
357        }
358
359        if (path->slots[0] < nritems) {
360            btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
361        } else {
362            ret = find_next_key(path, 0, &key);
363            if (ret)
364                break;
365
366            caching_ctl->progress = last;
367            btrfs_release_path(extent_root, path);
368            up_read(&fs_info->extent_commit_sem);
369            mutex_unlock(&caching_ctl->mutex);
370            if (btrfs_transaction_in_commit(fs_info))
371                schedule_timeout(1);
372            else
373                cond_resched();
374            goto again;
375        }
376
377        if (key.objectid < block_group->key.objectid) {
378            path->slots[0]++;
379            continue;
380        }
381
382        if (key.objectid >= block_group->key.objectid +
383            block_group->key.offset)
384            break;
385
386        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
387            total_found += add_new_free_space(block_group,
388                              fs_info, last,
389                              key.objectid);
390            last = key.objectid + key.offset;
391
392            if (total_found > (1024 * 1024 * 2)) {
393                total_found = 0;
394                wake_up(&caching_ctl->wait);
395            }
396        }
397        path->slots[0]++;
398    }
399    ret = 0;
400
401    total_found += add_new_free_space(block_group, fs_info, last,
402                      block_group->key.objectid +
403                      block_group->key.offset);
404    caching_ctl->progress = (u64)-1;
405
406    spin_lock(&block_group->lock);
407    block_group->caching_ctl = NULL;
408    block_group->cached = BTRFS_CACHE_FINISHED;
409    spin_unlock(&block_group->lock);
410
411err:
412    btrfs_free_path(path);
413    up_read(&fs_info->extent_commit_sem);
414
415    free_excluded_extents(extent_root, block_group);
416
417    mutex_unlock(&caching_ctl->mutex);
418    wake_up(&caching_ctl->wait);
419
420    put_caching_control(caching_ctl);
421    atomic_dec(&block_group->space_info->caching_threads);
422    btrfs_put_block_group(block_group);
423
424    return 0;
425}
426
427static int cache_block_group(struct btrfs_block_group_cache *cache)
428{
429    struct btrfs_fs_info *fs_info = cache->fs_info;
430    struct btrfs_caching_control *caching_ctl;
431    struct task_struct *tsk;
432    int ret = 0;
433
434    smp_mb();
435    if (cache->cached != BTRFS_CACHE_NO)
436        return 0;
437
438    caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
439    BUG_ON(!caching_ctl);
440
441    INIT_LIST_HEAD(&caching_ctl->list);
442    mutex_init(&caching_ctl->mutex);
443    init_waitqueue_head(&caching_ctl->wait);
444    caching_ctl->block_group = cache;
445    caching_ctl->progress = cache->key.objectid;
446    /* one for caching kthread, one for caching block group list */
447    atomic_set(&caching_ctl->count, 2);
448
449    spin_lock(&cache->lock);
450    if (cache->cached != BTRFS_CACHE_NO) {
451        spin_unlock(&cache->lock);
452        kfree(caching_ctl);
453        return 0;
454    }
455    cache->caching_ctl = caching_ctl;
456    cache->cached = BTRFS_CACHE_STARTED;
457    spin_unlock(&cache->lock);
458
459    down_write(&fs_info->extent_commit_sem);
460    list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
461    up_write(&fs_info->extent_commit_sem);
462
463    atomic_inc(&cache->space_info->caching_threads);
464    btrfs_get_block_group(cache);
465
466    tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu",
467              cache->key.objectid);
468    if (IS_ERR(tsk)) {
469        ret = PTR_ERR(tsk);
470        printk(KERN_ERR "error running thread %d\n", ret);
471        BUG();
472    }
473
474    return ret;
475}
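
/*
 * Editorial sketch of how callers are expected to wait on the async
 * caching started above (the allocator elsewhere in this file does the
 * real version of this); "needed_bytenr" is a hypothetical cut-off.  The
 * wait condition pairs with the wake_up(&caching_ctl->wait) calls in
 * caching_kthread():
 *
 *    struct btrfs_caching_control *ctl;
 *
 *    cache_block_group(cache);
 *    ctl = get_caching_control(cache);
 *    if (ctl) {
 *        wait_event(ctl->wait, block_group_cache_done(cache) ||
 *                   ctl->progress > needed_bytenr);
 *        put_caching_control(ctl);
 *    }
 */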
476
477/*
478 * return the block group that starts at or after bytenr
479 */
480static struct btrfs_block_group_cache *
481btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
482{
483    struct btrfs_block_group_cache *cache;
484
485    cache = block_group_cache_tree_search(info, bytenr, 0);
486
487    return cache;
488}
489
490/*
491 * return the block group that contains the given bytenr
492 */
493struct btrfs_block_group_cache *btrfs_lookup_block_group(
494                         struct btrfs_fs_info *info,
495                         u64 bytenr)
496{
497    struct btrfs_block_group_cache *cache;
498
499    cache = block_group_cache_tree_search(info, bytenr, 1);
500
501    return cache;
502}
503
504static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
505                          u64 flags)
506{
507    struct list_head *head = &info->space_info;
508    struct btrfs_space_info *found;
509
510    rcu_read_lock();
511    list_for_each_entry_rcu(found, head, list) {
512        if (found->flags == flags) {
513            rcu_read_unlock();
514            return found;
515        }
516    }
517    rcu_read_unlock();
518    return NULL;
519}
520
521/*
522 * after adding space to the filesystem, we need to clear the full flags
523 * on all the space infos.
524 */
525void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
526{
527    struct list_head *head = &info->space_info;
528    struct btrfs_space_info *found;
529
530    rcu_read_lock();
531    list_for_each_entry_rcu(found, head, list)
532        found->full = 0;
533    rcu_read_unlock();
534}
535
536static u64 div_factor(u64 num, int factor)
537{
538    if (factor == 10)
539        return num;
540    num *= factor;
541    do_div(num, 10);
542    return num;
543}
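
/*
 * Worked example (editorial): div_factor(1024, 9) computes 1024 * 9 / 10
 * = 921, i.e. roughly 90% of the input.  btrfs_find_block_group() below
 * uses factor 9 so that a group only qualifies while its used + pinned +
 * reserved bytes stay under that 90% mark; factor == 10 short-circuits
 * and simply returns the input.
 */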
544
545u64 btrfs_find_block_group(struct btrfs_root *root,
546               u64 search_start, u64 search_hint, int owner)
547{
548    struct btrfs_block_group_cache *cache;
549    u64 used;
550    u64 last = max(search_hint, search_start);
551    u64 group_start = 0;
552    int full_search = 0;
553    int factor = 9;
554    int wrapped = 0;
555again:
556    while (1) {
557        cache = btrfs_lookup_first_block_group(root->fs_info, last);
558        if (!cache)
559            break;
560
561        spin_lock(&cache->lock);
562        last = cache->key.objectid + cache->key.offset;
563        used = btrfs_block_group_used(&cache->item);
564
565        if ((full_search || !cache->ro) &&
566            block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
567            if (used + cache->pinned + cache->reserved <
568                div_factor(cache->key.offset, factor)) {
569                group_start = cache->key.objectid;
570                spin_unlock(&cache->lock);
571                btrfs_put_block_group(cache);
572                goto found;
573            }
574        }
575        spin_unlock(&cache->lock);
576        btrfs_put_block_group(cache);
577        cond_resched();
578    }
579    if (!wrapped) {
580        last = search_start;
581        wrapped = 1;
582        goto again;
583    }
584    if (!full_search && factor < 10) {
585        last = search_start;
586        full_search = 1;
587        factor = 10;
588        goto again;
589    }
590found:
591    return group_start;
592}
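
/*
 * Editorial note on the search policy above: the first pass starts at the
 * hint and only accepts read-write metadata groups that are under the 90%
 * threshold (factor 9).  If nothing qualifies, the search wraps back to
 * search_start, and a final full_search pass relaxes the threshold to
 * 100% (factor 10) and also accepts read-only groups before giving up and
 * returning 0.
 */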
593
594/* simple helper to search for an existing extent at a given offset */
595int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
596{
597    int ret;
598    struct btrfs_key key;
599    struct btrfs_path *path;
600
601    path = btrfs_alloc_path();
602    BUG_ON(!path);
603    key.objectid = start;
604    key.offset = len;
605    btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
606    ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
607                0, 0);
608    btrfs_free_path(path);
609    return ret;
610}
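
/*
 * Usage note (editorial): 0 means an EXTENT_ITEM with exactly (start, len)
 * exists in the extent tree, a positive value means it does not, and a
 * negative value is an error from the search.  A hypothetical caller
 * checking that a block is still allocated:
 *
 *    ret = btrfs_lookup_extent(root, bytenr, num_bytes);
 *    if (ret < 0)
 *        return ret;
 *    if (ret > 0)
 *        ... no extent item, treat the block as freed ...
 */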
611
612/*
613 * Back reference rules. Back refs have three main goals:
614 *
615 * 1) differentiate between all holders of references to an extent so that
616 * when a reference is dropped we can make sure it was a valid reference
617 * before freeing the extent.
618 *
619 * 2) Provide enough information to quickly find the holders of an extent
620 * if we notice a given block is corrupted or bad.
621 *
622 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
623 * maintenance. This is actually the same as #2, but with a slightly
624 * different use case.
625 *
626 * There are two kinds of back refs. Implicit back refs are optimized
627 * for pointers in non-shared tree blocks. For a given pointer in a block,
628 * back refs of this kind provide information about the block's owner tree
629 * and the pointer's key. This information allows us to find the block by
630 * searching the b-tree. Full back refs are for pointers in tree blocks
631 * not referenced by their owner trees; the location of the tree block is
632 * recorded in the back ref. Full back refs are actually generic and can
633 * be used wherever implicit back refs are used. Their major shortcoming
634 * is overhead: every time a tree block gets COWed, we have to update the
635 * back ref entries for all pointers in it.
636 *
637 * For a newly allocated tree block, we use implicit back refs for
638 * pointers in it. This means most tree-related operations only involve
639 * implicit back refs. For a tree block created in an old transaction, the
640 * only way to drop a reference to it is to COW it. So we can detect the
641 * event that a tree block loses its owner tree's reference and do the
642 * back refs conversion.
643 *
644 * When a tree block is COW'd through a tree, there are four cases:
645 *
646 * The reference count of the block is one and the tree is the block's
647 * owner tree. Nothing to do in this case.
648 *
649 * The reference count of the block is one and the tree is not the
650 * block's owner tree. In this case, full back refs are used for pointers
651 * in the block. Remove these full back refs and add implicit back refs for
652 * every pointer in the new block.
653 *
654 * The reference count of the block is greater than one and the tree is
655 * the block's owner tree. In this case, implicit back refs are used for
656 * pointers in the block. Add full back refs for every pointer in the
657 * block and increase lower level extents' reference counts. The original
658 * implicit back refs are inherited by the new block.
659 *
660 * The reference count of the block is greater than one and the tree is
661 * not the block's owner tree. Add implicit back refs for every pointer in
662 * the new block and increase lower level extents' reference counts.
663 *
664 * Back reference key composition:
665 *
666 * The key objectid corresponds to the first byte in the extent,
667 * The key type is used to differentiate between types of back refs.
668 * There are different meanings of the key offset for different types
669 * of back refs.
670 *
671 * File extents can be referenced by:
672 *
673 * - multiple snapshots, subvolumes, or different generations in one subvol
674 * - different files inside a single subvolume
675 * - different offsets inside a file (bookend extents in file.c)
676 *
677 * The extent ref structure for the implicit back refs has fields for:
678 *
679 * - Objectid of the subvolume root
680 * - objectid of the file holding the reference
681 * - original offset in the file
682 * - how many bookend extents
683 *
684 * The key offset for the implicit back refs is a hash of the first
685 * three fields.
686 *
687 * The extent ref structure for the full back refs has a field for:
688 *
689 * - number of pointers in the tree leaf
690 *
691 * The key offset for the full back refs is the first byte of
692 * the tree leaf.
693 *
694 * When a file extent is allocated, the implicit back refs are used
695 * and the fields are filled in:
696 *
697 * (root_key.objectid, inode objectid, offset in file, 1)
698 *
699 * When a file extent is removed during file truncation, we find the
700 * corresponding implicit back refs and check the following fields:
701 *
702 * (btrfs_header_owner(leaf), inode objectid, offset in file)
703 *
704 * Btree extents can be referenced by:
705 *
706 * - Different subvolumes
707 *
708 * Both the implicit back refs and the full back refs for tree blocks
709 * consist only of a key. The key offset for the implicit back refs is
710 * the objectid of the block's owner tree. The key offset for the full
711 * back refs is the first byte of the parent block.
712 *
713 * When implicit back refs are used, information about the lowest key and
714 * the level of the tree block is required. This information is stored in
715 * the tree block info structure.
716 */
717
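/*
 * Worked example of the key composition described above (editorial, with
 * made-up numbers): a data extent at bytenr 136708096 that is 8192 bytes
 * long and referenced once by inode 257 at file offset 0 in subvolume 5
 * is described by the extent item key
 *
 *    (136708096, BTRFS_EXTENT_ITEM_KEY, 8192)
 *
 * plus an implicit back ref whose key is
 *
 *    (136708096, BTRFS_EXTENT_DATA_REF_KEY,
 *     hash_extent_data_ref(5, 257, 0))
 *
 * and whose btrfs_extent_data_ref item stores (5, 257, 0, 1).  If the same
 * extent were referenced through a full back ref instead, the key would be
 * (136708096, BTRFS_SHARED_DATA_REF_KEY, <bytenr of the referencing leaf>).
 */
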
718#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
719static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
720                  struct btrfs_root *root,
721                  struct btrfs_path *path,
722                  u64 owner, u32 extra_size)
723{
724    struct btrfs_extent_item *item;
725    struct btrfs_extent_item_v0 *ei0;
726    struct btrfs_extent_ref_v0 *ref0;
727    struct btrfs_tree_block_info *bi;
728    struct extent_buffer *leaf;
729    struct btrfs_key key;
730    struct btrfs_key found_key;
731    u32 new_size = sizeof(*item);
732    u64 refs;
733    int ret;
734
735    leaf = path->nodes[0];
736    BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
737
738    btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
739    ei0 = btrfs_item_ptr(leaf, path->slots[0],
740                 struct btrfs_extent_item_v0);
741    refs = btrfs_extent_refs_v0(leaf, ei0);
742
743    if (owner == (u64)-1) {
744        while (1) {
745            if (path->slots[0] >= btrfs_header_nritems(leaf)) {
746                ret = btrfs_next_leaf(root, path);
747                if (ret < 0)
748                    return ret;
749                BUG_ON(ret > 0);
750                leaf = path->nodes[0];
751            }
752            btrfs_item_key_to_cpu(leaf, &found_key,
753                          path->slots[0]);
754            BUG_ON(key.objectid != found_key.objectid);
755            if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
756                path->slots[0]++;
757                continue;
758            }
759            ref0 = btrfs_item_ptr(leaf, path->slots[0],
760                          struct btrfs_extent_ref_v0);
761            owner = btrfs_ref_objectid_v0(leaf, ref0);
762            break;
763        }
764    }
765    btrfs_release_path(root, path);
766
767    if (owner < BTRFS_FIRST_FREE_OBJECTID)
768        new_size += sizeof(*bi);
769
770    new_size -= sizeof(*ei0);
771    ret = btrfs_search_slot(trans, root, &key, path,
772                new_size + extra_size, 1);
773    if (ret < 0)
774        return ret;
775    BUG_ON(ret);
776
777    ret = btrfs_extend_item(trans, root, path, new_size);
778    BUG_ON(ret);
779
780    leaf = path->nodes[0];
781    item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
782    btrfs_set_extent_refs(leaf, item, refs);
783    /* FIXME: get real generation */
784    btrfs_set_extent_generation(leaf, item, 0);
785    if (owner < BTRFS_FIRST_FREE_OBJECTID) {
786        btrfs_set_extent_flags(leaf, item,
787                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
788                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
789        bi = (struct btrfs_tree_block_info *)(item + 1);
790        /* FIXME: get first key of the block */
791        memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
792        btrfs_set_tree_block_level(leaf, bi, (int)owner);
793    } else {
794        btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
795    }
796    btrfs_mark_buffer_dirty(leaf);
797    return 0;
798}
799#endif
800
801static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
802{
803    u32 high_crc = ~(u32)0;
804    u32 low_crc = ~(u32)0;
805    __le64 lenum;
806
807    lenum = cpu_to_le64(root_objectid);
808    high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
809    lenum = cpu_to_le64(owner);
810    low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
811    lenum = cpu_to_le64(offset);
812    low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
813
814    return ((u64)high_crc << 31) ^ (u64)low_crc;
815}
816
817static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
818                     struct btrfs_extent_data_ref *ref)
819{
820    return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
821                    btrfs_extent_data_ref_objectid(leaf, ref),
822                    btrfs_extent_data_ref_offset(leaf, ref));
823}
824
825static int match_extent_data_ref(struct extent_buffer *leaf,
826                 struct btrfs_extent_data_ref *ref,
827                 u64 root_objectid, u64 owner, u64 offset)
828{
829    if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
830        btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
831        btrfs_extent_data_ref_offset(leaf, ref) != offset)
832        return 0;
833    return 1;
834}
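
/*
 * Editorial sketch of how hash collisions are handled by the helpers
 * below: the hash only picks the starting key.offset, and actual matches
 * are always confirmed field by field.
 *
 *    key.objectid = bytenr;
 *    key.type = BTRFS_EXTENT_DATA_REF_KEY;
 *    key.offset = hash_extent_data_ref(root_objectid, owner, offset);
 *    search, then walk forward while (objectid, type) still match,
 *    calling match_extent_data_ref() on every candidate item
 */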
835
836static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
837                       struct btrfs_root *root,
838                       struct btrfs_path *path,
839                       u64 bytenr, u64 parent,
840                       u64 root_objectid,
841                       u64 owner, u64 offset)
842{
843    struct btrfs_key key;
844    struct btrfs_extent_data_ref *ref;
845    struct extent_buffer *leaf;
846    u32 nritems;
847    int ret;
848    int recow;
849    int err = -ENOENT;
850
851    key.objectid = bytenr;
852    if (parent) {
853        key.type = BTRFS_SHARED_DATA_REF_KEY;
854        key.offset = parent;
855    } else {
856        key.type = BTRFS_EXTENT_DATA_REF_KEY;
857        key.offset = hash_extent_data_ref(root_objectid,
858                          owner, offset);
859    }
860again:
861    recow = 0;
862    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
863    if (ret < 0) {
864        err = ret;
865        goto fail;
866    }
867
868    if (parent) {
869        if (!ret)
870            return 0;
871#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
872        key.type = BTRFS_EXTENT_REF_V0_KEY;
873        btrfs_release_path(root, path);
874        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
875        if (ret < 0) {
876            err = ret;
877            goto fail;
878        }
879        if (!ret)
880            return 0;
881#endif
882        goto fail;
883    }
884
885    leaf = path->nodes[0];
886    nritems = btrfs_header_nritems(leaf);
887    while (1) {
888        if (path->slots[0] >= nritems) {
889            ret = btrfs_next_leaf(root, path);
890            if (ret < 0)
891                err = ret;
892            if (ret)
893                goto fail;
894
895            leaf = path->nodes[0];
896            nritems = btrfs_header_nritems(leaf);
897            recow = 1;
898        }
899
900        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
901        if (key.objectid != bytenr ||
902            key.type != BTRFS_EXTENT_DATA_REF_KEY)
903            goto fail;
904
905        ref = btrfs_item_ptr(leaf, path->slots[0],
906                     struct btrfs_extent_data_ref);
907
908        if (match_extent_data_ref(leaf, ref, root_objectid,
909                      owner, offset)) {
910            if (recow) {
911                btrfs_release_path(root, path);
912                goto again;
913            }
914            err = 0;
915            break;
916        }
917        path->slots[0]++;
918    }
919fail:
920    return err;
921}
922
923static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
924                       struct btrfs_root *root,
925                       struct btrfs_path *path,
926                       u64 bytenr, u64 parent,
927                       u64 root_objectid, u64 owner,
928                       u64 offset, int refs_to_add)
929{
930    struct btrfs_key key;
931    struct extent_buffer *leaf;
932    u32 size;
933    u32 num_refs;
934    int ret;
935
936    key.objectid = bytenr;
937    if (parent) {
938        key.type = BTRFS_SHARED_DATA_REF_KEY;
939        key.offset = parent;
940        size = sizeof(struct btrfs_shared_data_ref);
941    } else {
942        key.type = BTRFS_EXTENT_DATA_REF_KEY;
943        key.offset = hash_extent_data_ref(root_objectid,
944                          owner, offset);
945        size = sizeof(struct btrfs_extent_data_ref);
946    }
947
948    ret = btrfs_insert_empty_item(trans, root, path, &key, size);
949    if (ret && ret != -EEXIST)
950        goto fail;
951
952    leaf = path->nodes[0];
953    if (parent) {
954        struct btrfs_shared_data_ref *ref;
955        ref = btrfs_item_ptr(leaf, path->slots[0],
956                     struct btrfs_shared_data_ref);
957        if (ret == 0) {
958            btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
959        } else {
960            num_refs = btrfs_shared_data_ref_count(leaf, ref);
961            num_refs += refs_to_add;
962            btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
963        }
964    } else {
965        struct btrfs_extent_data_ref *ref;
966        while (ret == -EEXIST) {
967            ref = btrfs_item_ptr(leaf, path->slots[0],
968                         struct btrfs_extent_data_ref);
969            if (match_extent_data_ref(leaf, ref, root_objectid,
970                          owner, offset))
971                break;
972            btrfs_release_path(root, path);
973            key.offset++;
974            ret = btrfs_insert_empty_item(trans, root, path, &key,
975                              size);
976            if (ret && ret != -EEXIST)
977                goto fail;
978
979            leaf = path->nodes[0];
980        }
981        ref = btrfs_item_ptr(leaf, path->slots[0],
982                     struct btrfs_extent_data_ref);
983        if (ret == 0) {
984            btrfs_set_extent_data_ref_root(leaf, ref,
985                               root_objectid);
986            btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
987            btrfs_set_extent_data_ref_offset(leaf, ref, offset);
988            btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
989        } else {
990            num_refs = btrfs_extent_data_ref_count(leaf, ref);
991            num_refs += refs_to_add;
992            btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
993        }
994    }
995    btrfs_mark_buffer_dirty(leaf);
996    ret = 0;
997fail:
998    btrfs_release_path(root, path);
999    return ret;
1000}
1001
1002static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1003                       struct btrfs_root *root,
1004                       struct btrfs_path *path,
1005                       int refs_to_drop)
1006{
1007    struct btrfs_key key;
1008    struct btrfs_extent_data_ref *ref1 = NULL;
1009    struct btrfs_shared_data_ref *ref2 = NULL;
1010    struct extent_buffer *leaf;
1011    u32 num_refs = 0;
1012    int ret = 0;
1013
1014    leaf = path->nodes[0];
1015    btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1016
1017    if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1018        ref1 = btrfs_item_ptr(leaf, path->slots[0],
1019                      struct btrfs_extent_data_ref);
1020        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1021    } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1022        ref2 = btrfs_item_ptr(leaf, path->slots[0],
1023                      struct btrfs_shared_data_ref);
1024        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1025#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1026    } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1027        struct btrfs_extent_ref_v0 *ref0;
1028        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1029                      struct btrfs_extent_ref_v0);
1030        num_refs = btrfs_ref_count_v0(leaf, ref0);
1031#endif
1032    } else {
1033        BUG();
1034    }
1035
1036    BUG_ON(num_refs < refs_to_drop);
1037    num_refs -= refs_to_drop;
1038
1039    if (num_refs == 0) {
1040        ret = btrfs_del_item(trans, root, path);
1041    } else {
1042        if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1043            btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1044        else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1045            btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1046#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1047        else {
1048            struct btrfs_extent_ref_v0 *ref0;
1049            ref0 = btrfs_item_ptr(leaf, path->slots[0],
1050                    struct btrfs_extent_ref_v0);
1051            btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1052        }
1053#endif
1054        btrfs_mark_buffer_dirty(leaf);
1055    }
1056    return ret;
1057}
1058
1059static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1060                      struct btrfs_path *path,
1061                      struct btrfs_extent_inline_ref *iref)
1062{
1063    struct btrfs_key key;
1064    struct extent_buffer *leaf;
1065    struct btrfs_extent_data_ref *ref1;
1066    struct btrfs_shared_data_ref *ref2;
1067    u32 num_refs = 0;
1068
1069    leaf = path->nodes[0];
1070    btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1071    if (iref) {
1072        if (btrfs_extent_inline_ref_type(leaf, iref) ==
1073            BTRFS_EXTENT_DATA_REF_KEY) {
1074            ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1075            num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1076        } else {
1077            ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1078            num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1079        }
1080    } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1081        ref1 = btrfs_item_ptr(leaf, path->slots[0],
1082                      struct btrfs_extent_data_ref);
1083        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1084    } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1085        ref2 = btrfs_item_ptr(leaf, path->slots[0],
1086                      struct btrfs_shared_data_ref);
1087        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1088#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1089    } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1090        struct btrfs_extent_ref_v0 *ref0;
1091        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1092                      struct btrfs_extent_ref_v0);
1093        num_refs = btrfs_ref_count_v0(leaf, ref0);
1094#endif
1095    } else {
1096        WARN_ON(1);
1097    }
1098    return num_refs;
1099}
1100
1101static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1102                      struct btrfs_root *root,
1103                      struct btrfs_path *path,
1104                      u64 bytenr, u64 parent,
1105                      u64 root_objectid)
1106{
1107    struct btrfs_key key;
1108    int ret;
1109
1110    key.objectid = bytenr;
1111    if (parent) {
1112        key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1113        key.offset = parent;
1114    } else {
1115        key.type = BTRFS_TREE_BLOCK_REF_KEY;
1116        key.offset = root_objectid;
1117    }
1118
1119    ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1120    if (ret > 0)
1121        ret = -ENOENT;
1122#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1123    if (ret == -ENOENT && parent) {
1124        btrfs_release_path(root, path);
1125        key.type = BTRFS_EXTENT_REF_V0_KEY;
1126        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1127        if (ret > 0)
1128            ret = -ENOENT;
1129    }
1130#endif
1131    return ret;
1132}
1133
1134static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1135                      struct btrfs_root *root,
1136                      struct btrfs_path *path,
1137                      u64 bytenr, u64 parent,
1138                      u64 root_objectid)
1139{
1140    struct btrfs_key key;
1141    int ret;
1142
1143    key.objectid = bytenr;
1144    if (parent) {
1145        key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1146        key.offset = parent;
1147    } else {
1148        key.type = BTRFS_TREE_BLOCK_REF_KEY;
1149        key.offset = root_objectid;
1150    }
1151
1152    ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1153    btrfs_release_path(root, path);
1154    return ret;
1155}
1156
1157static inline int extent_ref_type(u64 parent, u64 owner)
1158{
1159    int type;
1160    if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1161        if (parent > 0)
1162            type = BTRFS_SHARED_BLOCK_REF_KEY;
1163        else
1164            type = BTRFS_TREE_BLOCK_REF_KEY;
1165    } else {
1166        if (parent > 0)
1167            type = BTRFS_SHARED_DATA_REF_KEY;
1168        else
1169            type = BTRFS_EXTENT_DATA_REF_KEY;
1170    }
1171    return type;
1172}
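
/*
 * Editorial summary of the mapping implemented above:
 *
 *    tree block (owner < BTRFS_FIRST_FREE_OBJECTID), parent set
 *        -> BTRFS_SHARED_BLOCK_REF_KEY   (full back ref)
 *    tree block, no parent
 *        -> BTRFS_TREE_BLOCK_REF_KEY     (implicit back ref)
 *    data extent, parent set
 *        -> BTRFS_SHARED_DATA_REF_KEY    (full back ref)
 *    data extent, no parent
 *        -> BTRFS_EXTENT_DATA_REF_KEY    (implicit back ref)
 */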
1173
1174static int find_next_key(struct btrfs_path *path, int level,
1175             struct btrfs_key *key)
1176
1177{
1178    for (; level < BTRFS_MAX_LEVEL; level++) {
1179        if (!path->nodes[level])
1180            break;
1181        if (path->slots[level] + 1 >=
1182            btrfs_header_nritems(path->nodes[level]))
1183            continue;
1184        if (level == 0)
1185            btrfs_item_key_to_cpu(path->nodes[level], key,
1186                          path->slots[level] + 1);
1187        else
1188            btrfs_node_key_to_cpu(path->nodes[level], key,
1189                          path->slots[level] + 1);
1190        return 0;
1191    }
1192    return 1;
1193}
1194
1195/*
1196 * look for inline back ref. if back ref is found, *ref_ret is set
1197 * to the address of inline back ref, and 0 is returned.
1198 *
1199 * if back ref isn't found, *ref_ret is set to the address where it
1200 * should be inserted, and -ENOENT is returned.
1201 *
1202 * if insert is true and there are too many inline back refs, the path
1203 * points to the extent item, and -EAGAIN is returned.
1204 *
1205 * NOTE: inline back refs are ordered in the same way that back ref
1206 * items in the tree are ordered.
1207 */
1208static noinline_for_stack
1209int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1210                 struct btrfs_root *root,
1211                 struct btrfs_path *path,
1212                 struct btrfs_extent_inline_ref **ref_ret,
1213                 u64 bytenr, u64 num_bytes,
1214                 u64 parent, u64 root_objectid,
1215                 u64 owner, u64 offset, int insert)
1216{
1217    struct btrfs_key key;
1218    struct extent_buffer *leaf;
1219    struct btrfs_extent_item *ei;
1220    struct btrfs_extent_inline_ref *iref;
1221    u64 flags;
1222    u64 item_size;
1223    unsigned long ptr;
1224    unsigned long end;
1225    int extra_size;
1226    int type;
1227    int want;
1228    int ret;
1229    int err = 0;
1230
1231    key.objectid = bytenr;
1232    key.type = BTRFS_EXTENT_ITEM_KEY;
1233    key.offset = num_bytes;
1234
1235    want = extent_ref_type(parent, owner);
1236    if (insert) {
1237        extra_size = btrfs_extent_inline_ref_size(want);
1238        path->keep_locks = 1;
1239    } else
1240        extra_size = -1;
1241    ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1242    if (ret < 0) {
1243        err = ret;
1244        goto out;
1245    }
1246    BUG_ON(ret);
1247
1248    leaf = path->nodes[0];
1249    item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1250#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1251    if (item_size < sizeof(*ei)) {
1252        if (!insert) {
1253            err = -ENOENT;
1254            goto out;
1255        }
1256        ret = convert_extent_item_v0(trans, root, path, owner,
1257                         extra_size);
1258        if (ret < 0) {
1259            err = ret;
1260            goto out;
1261        }
1262        leaf = path->nodes[0];
1263        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1264    }
1265#endif
1266    BUG_ON(item_size < sizeof(*ei));
1267
1268    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1269    flags = btrfs_extent_flags(leaf, ei);
1270
1271    ptr = (unsigned long)(ei + 1);
1272    end = (unsigned long)ei + item_size;
1273
1274    if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1275        ptr += sizeof(struct btrfs_tree_block_info);
1276        BUG_ON(ptr > end);
1277    } else {
1278        BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1279    }
1280
1281    err = -ENOENT;
1282    while (1) {
1283        if (ptr >= end) {
1284            WARN_ON(ptr > end);
1285            break;
1286        }
1287        iref = (struct btrfs_extent_inline_ref *)ptr;
1288        type = btrfs_extent_inline_ref_type(leaf, iref);
1289        if (want < type)
1290            break;
1291        if (want > type) {
1292            ptr += btrfs_extent_inline_ref_size(type);
1293            continue;
1294        }
1295
1296        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1297            struct btrfs_extent_data_ref *dref;
1298            dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1299            if (match_extent_data_ref(leaf, dref, root_objectid,
1300                          owner, offset)) {
1301                err = 0;
1302                break;
1303            }
1304            if (hash_extent_data_ref_item(leaf, dref) <
1305                hash_extent_data_ref(root_objectid, owner, offset))
1306                break;
1307        } else {
1308            u64 ref_offset;
1309            ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1310            if (parent > 0) {
1311                if (parent == ref_offset) {
1312                    err = 0;
1313                    break;
1314                }
1315                if (ref_offset < parent)
1316                    break;
1317            } else {
1318                if (root_objectid == ref_offset) {
1319                    err = 0;
1320                    break;
1321                }
1322                if (ref_offset < root_objectid)
1323                    break;
1324            }
1325        }
1326        ptr += btrfs_extent_inline_ref_size(type);
1327    }
1328    if (err == -ENOENT && insert) {
1329        if (item_size + extra_size >=
1330            BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1331            err = -EAGAIN;
1332            goto out;
1333        }
1334        /*
1335         * To add a new inline back ref, we have to make sure
1336         * there is no corresponding back ref item.
1337         * For simplicity, we just do not add a new inline back
1338         * ref if there is any kind of item for this block.
1339         */
1340        if (find_next_key(path, 0, &key) == 0 &&
1341            key.objectid == bytenr &&
1342            key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1343            err = -EAGAIN;
1344            goto out;
1345        }
1346    }
1347    *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1348out:
1349    if (insert) {
1350        path->keep_locks = 0;
1351        btrfs_unlock_up_safe(path, 1);
1352    }
1353    return err;
1354}
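
/*
 * Editorial sketch of the calling convention established above; this is
 * essentially what insert_inline_extent_backref() and
 * __btrfs_inc_extent_ref() further below do:
 *
 *    ret = lookup_inline_extent_backref(..., insert = 1);
 *    if (ret == 0)
 *        update the existing inline ref found at *ref_ret;
 *    else if (ret == -ENOENT)
 *        insert a new inline ref at *ref_ret via
 *        setup_inline_extent_backref();
 *    else if (ret == -EAGAIN)
 *        bump the refs in the extent item and fall back to a
 *        separate back ref item via insert_extent_backref();
 */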
1355
1356/*
1357 * helper to add a new inline back ref
1358 */
1359static noinline_for_stack
1360int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1361                struct btrfs_root *root,
1362                struct btrfs_path *path,
1363                struct btrfs_extent_inline_ref *iref,
1364                u64 parent, u64 root_objectid,
1365                u64 owner, u64 offset, int refs_to_add,
1366                struct btrfs_delayed_extent_op *extent_op)
1367{
1368    struct extent_buffer *leaf;
1369    struct btrfs_extent_item *ei;
1370    unsigned long ptr;
1371    unsigned long end;
1372    unsigned long item_offset;
1373    u64 refs;
1374    int size;
1375    int type;
1376    int ret;
1377
1378    leaf = path->nodes[0];
1379    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1380    item_offset = (unsigned long)iref - (unsigned long)ei;
1381
1382    type = extent_ref_type(parent, owner);
1383    size = btrfs_extent_inline_ref_size(type);
1384
1385    ret = btrfs_extend_item(trans, root, path, size);
1386    BUG_ON(ret);
1387
1388    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1389    refs = btrfs_extent_refs(leaf, ei);
1390    refs += refs_to_add;
1391    btrfs_set_extent_refs(leaf, ei, refs);
1392    if (extent_op)
1393        __run_delayed_extent_op(extent_op, leaf, ei);
1394
1395    ptr = (unsigned long)ei + item_offset;
1396    end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1397    if (ptr < end - size)
1398        memmove_extent_buffer(leaf, ptr + size, ptr,
1399                      end - size - ptr);
1400
1401    iref = (struct btrfs_extent_inline_ref *)ptr;
1402    btrfs_set_extent_inline_ref_type(leaf, iref, type);
1403    if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1404        struct btrfs_extent_data_ref *dref;
1405        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1406        btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1407        btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1408        btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1409        btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1410    } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1411        struct btrfs_shared_data_ref *sref;
1412        sref = (struct btrfs_shared_data_ref *)(iref + 1);
1413        btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1414        btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1415    } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1416        btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1417    } else {
1418        btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1419    }
1420    btrfs_mark_buffer_dirty(leaf);
1421    return 0;
1422}
1423
1424static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1425                 struct btrfs_root *root,
1426                 struct btrfs_path *path,
1427                 struct btrfs_extent_inline_ref **ref_ret,
1428                 u64 bytenr, u64 num_bytes, u64 parent,
1429                 u64 root_objectid, u64 owner, u64 offset)
1430{
1431    int ret;
1432
1433    ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1434                       bytenr, num_bytes, parent,
1435                       root_objectid, owner, offset, 0);
1436    if (ret != -ENOENT)
1437        return ret;
1438
1439    btrfs_release_path(root, path);
1440    *ref_ret = NULL;
1441
1442    if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1443        ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1444                        root_objectid);
1445    } else {
1446        ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1447                         root_objectid, owner, offset);
1448    }
1449    return ret;
1450}
1451
1452/*
1453 * helper to update/remove inline back ref
1454 */
1455static noinline_for_stack
1456int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1457                 struct btrfs_root *root,
1458                 struct btrfs_path *path,
1459                 struct btrfs_extent_inline_ref *iref,
1460                 int refs_to_mod,
1461                 struct btrfs_delayed_extent_op *extent_op)
1462{
1463    struct extent_buffer *leaf;
1464    struct btrfs_extent_item *ei;
1465    struct btrfs_extent_data_ref *dref = NULL;
1466    struct btrfs_shared_data_ref *sref = NULL;
1467    unsigned long ptr;
1468    unsigned long end;
1469    u32 item_size;
1470    int size;
1471    int type;
1472    int ret;
1473    u64 refs;
1474
1475    leaf = path->nodes[0];
1476    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1477    refs = btrfs_extent_refs(leaf, ei);
1478    WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1479    refs += refs_to_mod;
1480    btrfs_set_extent_refs(leaf, ei, refs);
1481    if (extent_op)
1482        __run_delayed_extent_op(extent_op, leaf, ei);
1483
1484    type = btrfs_extent_inline_ref_type(leaf, iref);
1485
1486    if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1487        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1488        refs = btrfs_extent_data_ref_count(leaf, dref);
1489    } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1490        sref = (struct btrfs_shared_data_ref *)(iref + 1);
1491        refs = btrfs_shared_data_ref_count(leaf, sref);
1492    } else {
1493        refs = 1;
1494        BUG_ON(refs_to_mod != -1);
1495    }
1496
1497    BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1498    refs += refs_to_mod;
1499
1500    if (refs > 0) {
1501        if (type == BTRFS_EXTENT_DATA_REF_KEY)
1502            btrfs_set_extent_data_ref_count(leaf, dref, refs);
1503        else
1504            btrfs_set_shared_data_ref_count(leaf, sref, refs);
1505    } else {
1506        size = btrfs_extent_inline_ref_size(type);
1507        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1508        ptr = (unsigned long)iref;
1509        end = (unsigned long)ei + item_size;
1510        if (ptr + size < end)
1511            memmove_extent_buffer(leaf, ptr, ptr + size,
1512                          end - ptr - size);
1513        item_size -= size;
1514        ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1515        BUG_ON(ret);
1516    }
1517    btrfs_mark_buffer_dirty(leaf);
1518    return 0;
1519}
1520
1521static noinline_for_stack
1522int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1523                 struct btrfs_root *root,
1524                 struct btrfs_path *path,
1525                 u64 bytenr, u64 num_bytes, u64 parent,
1526                 u64 root_objectid, u64 owner,
1527                 u64 offset, int refs_to_add,
1528                 struct btrfs_delayed_extent_op *extent_op)
1529{
1530    struct btrfs_extent_inline_ref *iref;
1531    int ret;
1532
1533    ret = lookup_inline_extent_backref(trans, root, path, &iref,
1534                       bytenr, num_bytes, parent,
1535                       root_objectid, owner, offset, 1);
1536    if (ret == 0) {
1537        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1538        ret = update_inline_extent_backref(trans, root, path, iref,
1539                           refs_to_add, extent_op);
1540    } else if (ret == -ENOENT) {
1541        ret = setup_inline_extent_backref(trans, root, path, iref,
1542                          parent, root_objectid,
1543                          owner, offset, refs_to_add,
1544                          extent_op);
1545    }
1546    return ret;
1547}
1548
1549static int insert_extent_backref(struct btrfs_trans_handle *trans,
1550                 struct btrfs_root *root,
1551                 struct btrfs_path *path,
1552                 u64 bytenr, u64 parent, u64 root_objectid,
1553                 u64 owner, u64 offset, int refs_to_add)
1554{
1555    int ret;
1556    if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1557        BUG_ON(refs_to_add != 1);
1558        ret = insert_tree_block_ref(trans, root, path, bytenr,
1559                        parent, root_objectid);
1560    } else {
1561        ret = insert_extent_data_ref(trans, root, path, bytenr,
1562                         parent, root_objectid,
1563                         owner, offset, refs_to_add);
1564    }
1565    return ret;
1566}
1567
1568static int remove_extent_backref(struct btrfs_trans_handle *trans,
1569                 struct btrfs_root *root,
1570                 struct btrfs_path *path,
1571                 struct btrfs_extent_inline_ref *iref,
1572                 int refs_to_drop, int is_data)
1573{
1574    int ret;
1575
1576    BUG_ON(!is_data && refs_to_drop != 1);
1577    if (iref) {
1578        ret = update_inline_extent_backref(trans, root, path, iref,
1579                           -refs_to_drop, NULL);
1580    } else if (is_data) {
1581        ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1582    } else {
1583        ret = btrfs_del_item(trans, root, path);
1584    }
1585    return ret;
1586}
1587
1588static void btrfs_issue_discard(struct block_device *bdev,
1589                u64 start, u64 len)
1590{
1591    blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592                 DISCARD_FL_BARRIER);
1593}
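
/*
 * Editorial note: blkdev_issue_discard() takes 512-byte sectors, so the
 * byte values above are shifted down by 9.  For example a 1 MiB discard
 * at byte offset 1 MiB becomes sector 2048 with a length of 2048 sectors.
 */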
1594
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1596                u64 num_bytes)
1597{
1598    int ret;
1599    u64 map_length = num_bytes;
1600    struct btrfs_multi_bio *multi = NULL;
1601
1602    if (!btrfs_test_opt(root, DISCARD))
1603        return 0;
1604
1605    /* Tell the block device(s) that the sectors can be discarded */
1606    ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1607                  bytenr, &map_length, &multi, 0);
1608    if (!ret) {
1609        struct btrfs_bio_stripe *stripe = multi->stripes;
1610        int i;
1611
1612        if (map_length > num_bytes)
1613            map_length = num_bytes;
1614
1615        for (i = 0; i < multi->num_stripes; i++, stripe++) {
1616            btrfs_issue_discard(stripe->dev->bdev,
1617                        stripe->physical,
1618                        map_length);
1619        }
1620        kfree(multi);
1621    }
1622
1623    return ret;
1624}
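
/*
 * Editorial note (not part of the original source): blkdev_issue_discard()
 * takes 512-byte sector units, which is why btrfs_issue_discard() above
 * shifts the byte values right by 9 (2^9 == 512).  As a worked example, a
 * stripe at physical byte offset 1048576 with a map_length of 131072 bytes
 * is submitted as sector 2048 with a length of 256 sectors.  A hypothetical
 * helper making the conversion explicit could look like this sketch:
 */
#if 0    /* illustration only */
static inline u64 bytes_to_512b_sectors(u64 bytes)
{
    return bytes >> 9;    /* same conversion as the >> 9 above */
}
#endif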
1625
1626int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1627             struct btrfs_root *root,
1628             u64 bytenr, u64 num_bytes, u64 parent,
1629             u64 root_objectid, u64 owner, u64 offset)
1630{
1631    int ret;
1632    BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1633           root_objectid == BTRFS_TREE_LOG_OBJECTID);
1634
1635    if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1636        ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1637                    parent, root_objectid, (int)owner,
1638                    BTRFS_ADD_DELAYED_REF, NULL);
1639    } else {
1640        ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1641                    parent, root_objectid, owner, offset,
1642                    BTRFS_ADD_DELAYED_REF, NULL);
1643    }
1644    return ret;
1645}
1646
1647static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1648                  struct btrfs_root *root,
1649                  u64 bytenr, u64 num_bytes,
1650                  u64 parent, u64 root_objectid,
1651                  u64 owner, u64 offset, int refs_to_add,
1652                  struct btrfs_delayed_extent_op *extent_op)
1653{
1654    struct btrfs_path *path;
1655    struct extent_buffer *leaf;
1656    struct btrfs_extent_item *item;
1657    u64 refs;
1658    int ret;
1659    int err = 0;
1660
1661    path = btrfs_alloc_path();
1662    if (!path)
1663        return -ENOMEM;
1664
1665    path->reada = 1;
1666    path->leave_spinning = 1;
1667    /* this will setup the path even if it fails to insert the back ref */
1668    ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1669                       path, bytenr, num_bytes, parent,
1670                       root_objectid, owner, offset,
1671                       refs_to_add, extent_op);
1672    if (ret == 0)
1673        goto out;
1674
1675    if (ret != -EAGAIN) {
1676        err = ret;
1677        goto out;
1678    }
1679
1680    leaf = path->nodes[0];
1681    item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1682    refs = btrfs_extent_refs(leaf, item);
1683    btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1684    if (extent_op)
1685        __run_delayed_extent_op(extent_op, leaf, item);
1686
1687    btrfs_mark_buffer_dirty(leaf);
1688    btrfs_release_path(root->fs_info->extent_root, path);
1689
1690    path->reada = 1;
1691    path->leave_spinning = 1;
1692
1693    /* now insert the actual backref */
1694    ret = insert_extent_backref(trans, root->fs_info->extent_root,
1695                    path, bytenr, parent, root_objectid,
1696                    owner, offset, refs_to_add);
1697    BUG_ON(ret);
1698out:
1699    btrfs_free_path(path);
1700    return err;
1701}
1702
1703static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1704                struct btrfs_root *root,
1705                struct btrfs_delayed_ref_node *node,
1706                struct btrfs_delayed_extent_op *extent_op,
1707                int insert_reserved)
1708{
1709    int ret = 0;
1710    struct btrfs_delayed_data_ref *ref;
1711    struct btrfs_key ins;
1712    u64 parent = 0;
1713    u64 ref_root = 0;
1714    u64 flags = 0;
1715
1716    ins.objectid = node->bytenr;
1717    ins.offset = node->num_bytes;
1718    ins.type = BTRFS_EXTENT_ITEM_KEY;
1719
1720    ref = btrfs_delayed_node_to_data_ref(node);
1721    if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1722        parent = ref->parent;
1723    else
1724        ref_root = ref->root;
1725
1726    if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1727        if (extent_op) {
1728            BUG_ON(extent_op->update_key);
1729            flags |= extent_op->flags_to_set;
1730        }
1731        ret = alloc_reserved_file_extent(trans, root,
1732                         parent, ref_root, flags,
1733                         ref->objectid, ref->offset,
1734                         &ins, node->ref_mod);
1735    } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1736        ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1737                         node->num_bytes, parent,
1738                         ref_root, ref->objectid,
1739                         ref->offset, node->ref_mod,
1740                         extent_op);
1741    } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1742        ret = __btrfs_free_extent(trans, root, node->bytenr,
1743                      node->num_bytes, parent,
1744                      ref_root, ref->objectid,
1745                      ref->offset, node->ref_mod,
1746                      extent_op);
1747    } else {
1748        BUG();
1749    }
1750    return ret;
1751}
1752
1753static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1754                    struct extent_buffer *leaf,
1755                    struct btrfs_extent_item *ei)
1756{
1757    u64 flags = btrfs_extent_flags(leaf, ei);
1758    if (extent_op->update_flags) {
1759        flags |= extent_op->flags_to_set;
1760        btrfs_set_extent_flags(leaf, ei, flags);
1761    }
1762
1763    if (extent_op->update_key) {
1764        struct btrfs_tree_block_info *bi;
1765        BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1766        bi = (struct btrfs_tree_block_info *)(ei + 1);
1767        btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1768    }
1769}
1770
1771static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1772                 struct btrfs_root *root,
1773                 struct btrfs_delayed_ref_node *node,
1774                 struct btrfs_delayed_extent_op *extent_op)
1775{
1776    struct btrfs_key key;
1777    struct btrfs_path *path;
1778    struct btrfs_extent_item *ei;
1779    struct extent_buffer *leaf;
1780    u32 item_size;
1781    int ret;
1782    int err = 0;
1783
1784    path = btrfs_alloc_path();
1785    if (!path)
1786        return -ENOMEM;
1787
1788    key.objectid = node->bytenr;
1789    key.type = BTRFS_EXTENT_ITEM_KEY;
1790    key.offset = node->num_bytes;
1791
1792    path->reada = 1;
1793    path->leave_spinning = 1;
1794    ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1795                path, 0, 1);
1796    if (ret < 0) {
1797        err = ret;
1798        goto out;
1799    }
1800    if (ret > 0) {
1801        err = -EIO;
1802        goto out;
1803    }
1804
1805    leaf = path->nodes[0];
1806    item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1807#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1808    if (item_size < sizeof(*ei)) {
1809        ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1810                         path, (u64)-1, 0);
1811        if (ret < 0) {
1812            err = ret;
1813            goto out;
1814        }
1815        leaf = path->nodes[0];
1816        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1817    }
1818#endif
1819    BUG_ON(item_size < sizeof(*ei));
1820    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1821    __run_delayed_extent_op(extent_op, leaf, ei);
1822
1823    btrfs_mark_buffer_dirty(leaf);
1824out:
1825    btrfs_free_path(path);
1826    return err;
1827}
1828
1829static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1830                struct btrfs_root *root,
1831                struct btrfs_delayed_ref_node *node,
1832                struct btrfs_delayed_extent_op *extent_op,
1833                int insert_reserved)
1834{
1835    int ret = 0;
1836    struct btrfs_delayed_tree_ref *ref;
1837    struct btrfs_key ins;
1838    u64 parent = 0;
1839    u64 ref_root = 0;
1840
1841    ins.objectid = node->bytenr;
1842    ins.offset = node->num_bytes;
1843    ins.type = BTRFS_EXTENT_ITEM_KEY;
1844
1845    ref = btrfs_delayed_node_to_tree_ref(node);
1846    if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1847        parent = ref->parent;
1848    else
1849        ref_root = ref->root;
1850
1851    BUG_ON(node->ref_mod != 1);
1852    if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1853        BUG_ON(!extent_op || !extent_op->update_flags ||
1854               !extent_op->update_key);
1855        ret = alloc_reserved_tree_block(trans, root,
1856                        parent, ref_root,
1857                        extent_op->flags_to_set,
1858                        &extent_op->key,
1859                        ref->level, &ins);
1860    } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1861        ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1862                         node->num_bytes, parent, ref_root,
1863                         ref->level, 0, 1, extent_op);
1864    } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1865        ret = __btrfs_free_extent(trans, root, node->bytenr,
1866                      node->num_bytes, parent, ref_root,
1867                      ref->level, 0, 1, extent_op);
1868    } else {
1869        BUG();
1870    }
1871    return ret;
1872}
1873
1874
1875/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877                   struct btrfs_root *root,
1878                   struct btrfs_delayed_ref_node *node,
1879                   struct btrfs_delayed_extent_op *extent_op,
1880                   int insert_reserved)
1881{
1882    int ret;
1883    if (btrfs_delayed_ref_is_head(node)) {
1884        struct btrfs_delayed_ref_head *head;
1885        /*
1886         * we've hit the end of the chain and we were supposed
1887         * to insert this extent into the tree. But, it got
1888         * deleted before we ever needed to insert it, so all
1889         * we have to do is clean up the accounting
1890         */
1891        BUG_ON(extent_op);
1892        head = btrfs_delayed_node_to_head(node);
1893        if (insert_reserved) {
1894            int mark_free = 0;
1895            struct extent_buffer *must_clean = NULL;
1896
1897            ret = pin_down_bytes(trans, root, NULL,
1898                         node->bytenr, node->num_bytes,
1899                         head->is_data, 1, &must_clean);
1900            if (ret > 0)
1901                mark_free = 1;
1902
1903            if (must_clean) {
1904                clean_tree_block(NULL, root, must_clean);
1905                btrfs_tree_unlock(must_clean);
1906                free_extent_buffer(must_clean);
1907            }
1908            if (head->is_data) {
1909                ret = btrfs_del_csums(trans, root,
1910                              node->bytenr,
1911                              node->num_bytes);
1912                BUG_ON(ret);
1913            }
1914            if (mark_free) {
1915                ret = btrfs_free_reserved_extent(root,
1916                            node->bytenr,
1917                            node->num_bytes);
1918                BUG_ON(ret);
1919            }
1920        }
1921        mutex_unlock(&head->mutex);
1922        return 0;
1923    }
1924
1925    if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1926        node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1927        ret = run_delayed_tree_ref(trans, root, node, extent_op,
1928                       insert_reserved);
1929    else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1930         node->type == BTRFS_SHARED_DATA_REF_KEY)
1931        ret = run_delayed_data_ref(trans, root, node, extent_op,
1932                       insert_reserved);
1933    else
1934        BUG();
1935    return ret;
1936}
1937
1938static noinline struct btrfs_delayed_ref_node *
1939select_delayed_ref(struct btrfs_delayed_ref_head *head)
1940{
1941    struct rb_node *node;
1942    struct btrfs_delayed_ref_node *ref;
1943    int action = BTRFS_ADD_DELAYED_REF;
1944again:
1945    /*
1946     * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
1947     * this prevents the ref count from going down to zero while
1948     * there are still pending delayed refs.
1949     */
1950    node = rb_prev(&head->node.rb_node);
1951    while (1) {
1952        if (!node)
1953            break;
1954        ref = rb_entry(node, struct btrfs_delayed_ref_node,
1955                rb_node);
1956        if (ref->bytenr != head->node.bytenr)
1957            break;
1958        if (ref->action == action)
1959            return ref;
1960        node = rb_prev(node);
1961    }
1962    if (action == BTRFS_ADD_DELAYED_REF) {
1963        action = BTRFS_DROP_DELAYED_REF;
1964        goto again;
1965    }
1966    return NULL;
1967}
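
/*
 * Editorial worked example (not part of the original source): suppose an
 * extent currently has a reference count of 1 and its head has one
 * BTRFS_ADD_DELAYED_REF and one BTRFS_DROP_DELAYED_REF queued.  Running the
 * drop first would take the count 1 -> 0 and free the extent while an add
 * is still pending; picking the add first gives 1 -> 2 -> 1 instead, which
 * is why select_delayed_ref() scans for BTRFS_ADD_DELAYED_REF entries
 * before falling back to the drops.
 */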
1968
1969static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
1970                       struct btrfs_root *root,
1971                       struct list_head *cluster)
1972{
1973    struct btrfs_delayed_ref_root *delayed_refs;
1974    struct btrfs_delayed_ref_node *ref;
1975    struct btrfs_delayed_ref_head *locked_ref = NULL;
1976    struct btrfs_delayed_extent_op *extent_op;
1977    int ret;
1978    int count = 0;
1979    int must_insert_reserved = 0;
1980
1981    delayed_refs = &trans->transaction->delayed_refs;
1982    while (1) {
1983        if (!locked_ref) {
1984            /* pick a new head ref from the cluster list */
1985            if (list_empty(cluster))
1986                break;
1987
1988            locked_ref = list_entry(cluster->next,
1989                     struct btrfs_delayed_ref_head, cluster);
1990
1991            /* grab the lock that says we are going to process
1992             * all the refs for this head */
1993            ret = btrfs_delayed_ref_lock(trans, locked_ref);
1994
1995            /*
1996             * we may have dropped the spin lock to get the head
1997             * mutex lock, and that might have given someone else
1998             * time to free the head. If that's true, it has been
1999             * removed from our list and we can move on.
2000             */
2001            if (ret == -EAGAIN) {
2002                locked_ref = NULL;
2003                count++;
2004                continue;
2005            }
2006        }
2007
2008        /*
2009         * record the must insert reserved flag before we
2010         * drop the spin lock.
2011         */
2012        must_insert_reserved = locked_ref->must_insert_reserved;
2013        locked_ref->must_insert_reserved = 0;
2014
2015        extent_op = locked_ref->extent_op;
2016        locked_ref->extent_op = NULL;
2017
2018        /*
2019         * locked_ref is the head node, so we have to go one
2020         * node back for any delayed ref updates
2021         */
2022        ref = select_delayed_ref(locked_ref);
2023        if (!ref) {
2024            /* All delayed refs have been processed.  Go ahead
2025             * and send the head node to run_one_delayed_ref,
2026             * so that any accounting fixes can happen.
2027             */
2028            ref = &locked_ref->node;
2029
2030            if (extent_op && must_insert_reserved) {
2031                kfree(extent_op);
2032                extent_op = NULL;
2033            }
2034
2035            if (extent_op) {
2036                spin_unlock(&delayed_refs->lock);
2037
2038                ret = run_delayed_extent_op(trans, root,
2039                                ref, extent_op);
2040                BUG_ON(ret);
2041                kfree(extent_op);
2042
2043                cond_resched();
2044                spin_lock(&delayed_refs->lock);
2045                continue;
2046            }
2047
2048            list_del_init(&locked_ref->cluster);
2049            locked_ref = NULL;
2050        }
2051
2052        ref->in_tree = 0;
2053        rb_erase(&ref->rb_node, &delayed_refs->root);
2054        delayed_refs->num_entries--;
2055
2056        spin_unlock(&delayed_refs->lock);
2057
2058        ret = run_one_delayed_ref(trans, root, ref, extent_op,
2059                      must_insert_reserved);
2060        BUG_ON(ret);
2061
2062        btrfs_put_delayed_ref(ref);
2063        kfree(extent_op);
2064        count++;
2065
2066        cond_resched();
2067        spin_lock(&delayed_refs->lock);
2068    }
2069    return count;
2070}
2071
2072/*
2073 * this starts processing the delayed reference count updates and
2074 * extent insertions we have queued up so far. count can be
2075 * 0, which means to process everything in the tree at the start
2076 * of the run (but not newly added entries), or it can be some target
2077 * number you'd like to process.
2078 */
2079int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2080               struct btrfs_root *root, unsigned long count)
2081{
2082    struct rb_node *node;
2083    struct btrfs_delayed_ref_root *delayed_refs;
2084    struct btrfs_delayed_ref_node *ref;
2085    struct list_head cluster;
2086    int ret;
2087    int run_all = count == (unsigned long)-1;
2088    int run_most = 0;
2089
2090    if (root == root->fs_info->extent_root)
2091        root = root->fs_info->tree_root;
2092
2093    delayed_refs = &trans->transaction->delayed_refs;
2094    INIT_LIST_HEAD(&cluster);
2095again:
2096    spin_lock(&delayed_refs->lock);
2097    if (count == 0) {
2098        count = delayed_refs->num_entries * 2;
2099        run_most = 1;
2100    }
2101    while (1) {
2102        if (!(run_all || run_most) &&
2103            delayed_refs->num_heads_ready < 64)
2104            break;
2105
2106        /*
2107         * go find something we can process in the rbtree. We start at
2108         * the beginning of the tree, and then build a cluster
2109         * of refs to process starting at the first one we are able to
2110         * lock
2111         */
2112        ret = btrfs_find_ref_cluster(trans, &cluster,
2113                         delayed_refs->run_delayed_start);
2114        if (ret)
2115            break;
2116
2117        ret = run_clustered_refs(trans, root, &cluster);
2118        BUG_ON(ret < 0);
2119
2120        count -= min_t(unsigned long, ret, count);
2121
2122        if (count == 0)
2123            break;
2124    }
2125
2126    if (run_all) {
2127        node = rb_first(&delayed_refs->root);
2128        if (!node)
2129            goto out;
2130        count = (unsigned long)-1;
2131
2132        while (node) {
2133            ref = rb_entry(node, struct btrfs_delayed_ref_node,
2134                       rb_node);
2135            if (btrfs_delayed_ref_is_head(ref)) {
2136                struct btrfs_delayed_ref_head *head;
2137
2138                head = btrfs_delayed_node_to_head(ref);
2139                atomic_inc(&ref->refs);
2140
2141                spin_unlock(&delayed_refs->lock);
2142                mutex_lock(&head->mutex);
2143                mutex_unlock(&head->mutex);
2144
2145                btrfs_put_delayed_ref(ref);
2146                cond_resched();
2147                goto again;
2148            }
2149            node = rb_next(node);
2150        }
2151        spin_unlock(&delayed_refs->lock);
2152        schedule_timeout(1);
2153        goto again;
2154    }
2155out:
2156    spin_unlock(&delayed_refs->lock);
2157    return 0;
2158}
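
/*
 * Editorial sketch (not part of the original source): the count argument
 * follows the convention in the comment above.  Hypothetical calls (trans
 * and root as in the other callers in this file) might look like:
 */
#if 0    /* illustration only */
    /* run everything, looping until the delayed ref tree is empty */
    ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);

    /* run roughly everything that was queued at the start of the run */
    ret = btrfs_run_delayed_refs(trans, root, 0);

    /* ask for roughly 32 refs worth of work and then return */
    ret = btrfs_run_delayed_refs(trans, root, 32);
#endif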
2159
2160int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2161                struct btrfs_root *root,
2162                u64 bytenr, u64 num_bytes, u64 flags,
2163                int is_data)
2164{
2165    struct btrfs_delayed_extent_op *extent_op;
2166    int ret;
2167
2168    extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2169    if (!extent_op)
2170        return -ENOMEM;
2171
2172    extent_op->flags_to_set = flags;
2173    extent_op->update_flags = 1;
2174    extent_op->update_key = 0;
2175    extent_op->is_data = is_data ? 1 : 0;
2176
2177    ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2178    if (ret)
2179        kfree(extent_op);
2180    return ret;
2181}
2182
2183static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2184                      struct btrfs_root *root,
2185                      struct btrfs_path *path,
2186                      u64 objectid, u64 offset, u64 bytenr)
2187{
2188    struct btrfs_delayed_ref_head *head;
2189    struct btrfs_delayed_ref_node *ref;
2190    struct btrfs_delayed_data_ref *data_ref;
2191    struct btrfs_delayed_ref_root *delayed_refs;
2192    struct rb_node *node;
2193    int ret = 0;
2194
2195    ret = -ENOENT;
2196    delayed_refs = &trans->transaction->delayed_refs;
2197    spin_lock(&delayed_refs->lock);
2198    head = btrfs_find_delayed_ref_head(trans, bytenr);
2199    if (!head)
2200        goto out;
2201
2202    if (!mutex_trylock(&head->mutex)) {
2203        atomic_inc(&head->node.refs);
2204        spin_unlock(&delayed_refs->lock);
2205
2206        btrfs_release_path(root->fs_info->extent_root, path);
2207
2208        mutex_lock(&head->mutex);
2209        mutex_unlock(&head->mutex);
2210        btrfs_put_delayed_ref(&head->node);
2211        return -EAGAIN;
2212    }
2213
2214    node = rb_prev(&head->node.rb_node);
2215    if (!node)
2216        goto out_unlock;
2217
2218    ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2219
2220    if (ref->bytenr != bytenr)
2221        goto out_unlock;
2222
2223    ret = 1;
2224    if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2225        goto out_unlock;
2226
2227    data_ref = btrfs_delayed_node_to_data_ref(ref);
2228
2229    node = rb_prev(node);
2230    if (node) {
2231        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2232        if (ref->bytenr == bytenr)
2233            goto out_unlock;
2234    }
2235
2236    if (data_ref->root != root->root_key.objectid ||
2237        data_ref->objectid != objectid || data_ref->offset != offset)
2238        goto out_unlock;
2239
2240    ret = 0;
2241out_unlock:
2242    mutex_unlock(&head->mutex);
2243out:
2244    spin_unlock(&delayed_refs->lock);
2245    return ret;
2246}
2247
2248static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2249                    struct btrfs_root *root,
2250                    struct btrfs_path *path,
2251                    u64 objectid, u64 offset, u64 bytenr)
2252{
2253    struct btrfs_root *extent_root = root->fs_info->extent_root;
2254    struct extent_buffer *leaf;
2255    struct btrfs_extent_data_ref *ref;
2256    struct btrfs_extent_inline_ref *iref;
2257    struct btrfs_extent_item *ei;
2258    struct btrfs_key key;
2259    u32 item_size;
2260    int ret;
2261
2262    key.objectid = bytenr;
2263    key.offset = (u64)-1;
2264    key.type = BTRFS_EXTENT_ITEM_KEY;
2265
2266    ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2267    if (ret < 0)
2268        goto out;
2269    BUG_ON(ret == 0);
2270
2271    ret = -ENOENT;
2272    if (path->slots[0] == 0)
2273        goto out;
2274
2275    path->slots[0]--;
2276    leaf = path->nodes[0];
2277    btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2278
2279    if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2280        goto out;
2281
2282    ret = 1;
2283    item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2284#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2285    if (item_size < sizeof(*ei)) {
2286        WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2287        goto out;
2288    }
2289#endif
2290    ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2291
2292    if (item_size != sizeof(*ei) +
2293        btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2294        goto out;
2295
2296    if (btrfs_extent_generation(leaf, ei) <=
2297        btrfs_root_last_snapshot(&root->root_item))
2298        goto out;
2299
2300    iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2301    if (btrfs_extent_inline_ref_type(leaf, iref) !=
2302        BTRFS_EXTENT_DATA_REF_KEY)
2303        goto out;
2304
2305    ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2306    if (btrfs_extent_refs(leaf, ei) !=
2307        btrfs_extent_data_ref_count(leaf, ref) ||
2308        btrfs_extent_data_ref_root(leaf, ref) !=
2309        root->root_key.objectid ||
2310        btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2311        btrfs_extent_data_ref_offset(leaf, ref) != offset)
2312        goto out;
2313
2314    ret = 0;
2315out:
2316    return ret;
2317}
2318
2319int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2320              struct btrfs_root *root,
2321              u64 objectid, u64 offset, u64 bytenr)
2322{
2323    struct btrfs_path *path;
2324    int ret;
2325    int ret2;
2326
2327    path = btrfs_alloc_path();
2328    if (!path)
2329        return -ENOENT;
2330
2331    do {
2332        ret = check_committed_ref(trans, root, path, objectid,
2333                      offset, bytenr);
2334        if (ret && ret != -ENOENT)
2335            goto out;
2336
2337        ret2 = check_delayed_ref(trans, root, path, objectid,
2338                     offset, bytenr);
2339    } while (ret2 == -EAGAIN);
2340
2341    if (ret2 && ret2 != -ENOENT) {
2342        ret = ret2;
2343        goto out;
2344    }
2345
2346    if (ret != -ENOENT || ret2 != -ENOENT)
2347        ret = 0;
2348out:
2349    btrfs_free_path(path);
2350    return ret;
2351}
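
/*
 * Editorial note (not part of the original source): both helpers above use
 * -ENOENT for "no reference found here at all", 0 for "the only reference
 * found belongs to (root, objectid, offset)", and other values for errors
 * or for layouts that may hide a foreign reference.  The combination logic
 * above therefore returns 0 ("no cross reference") only when neither the
 * committed extent tree nor the delayed refs reported a possible foreign
 * reference and at least one of them actually saw our reference; a caller
 * would typically treat any nonzero return as "the extent may be shared".
 */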
2352
2353#if 0
2354int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2355            struct extent_buffer *buf, u32 nr_extents)
2356{
2357    struct btrfs_key key;
2358    struct btrfs_file_extent_item *fi;
2359    u64 root_gen;
2360    u32 nritems;
2361    int i;
2362    int level;
2363    int ret = 0;
2364    int shared = 0;
2365
2366    if (!root->ref_cows)
2367        return 0;
2368
2369    if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2370        shared = 0;
2371        root_gen = root->root_key.offset;
2372    } else {
2373        shared = 1;
2374        root_gen = trans->transid - 1;
2375    }
2376
2377    level = btrfs_header_level(buf);
2378    nritems = btrfs_header_nritems(buf);
2379
2380    if (level == 0) {
2381        struct btrfs_leaf_ref *ref;
2382        struct btrfs_extent_info *info;
2383
2384        ref = btrfs_alloc_leaf_ref(root, nr_extents);
2385        if (!ref) {
2386            ret = -ENOMEM;
2387            goto out;
2388        }
2389
2390        ref->root_gen = root_gen;
2391        ref->bytenr = buf->start;
2392        ref->owner = btrfs_header_owner(buf);
2393        ref->generation = btrfs_header_generation(buf);
2394        ref->nritems = nr_extents;
2395        info = ref->extents;
2396
2397        for (i = 0; nr_extents > 0 && i < nritems; i++) {
2398            u64 disk_bytenr;
2399            btrfs_item_key_to_cpu(buf, &key, i);
2400            if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2401                continue;
2402            fi = btrfs_item_ptr(buf, i,
2403                        struct btrfs_file_extent_item);
2404            if (btrfs_file_extent_type(buf, fi) ==
2405                BTRFS_FILE_EXTENT_INLINE)
2406                continue;
2407            disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2408            if (disk_bytenr == 0)
2409                continue;
2410
2411            info->bytenr = disk_bytenr;
2412            info->num_bytes =
2413                btrfs_file_extent_disk_num_bytes(buf, fi);
2414            info->objectid = key.objectid;
2415            info->offset = key.offset;
2416            info++;
2417        }
2418
2419        ret = btrfs_add_leaf_ref(root, ref, shared);
2420        if (ret == -EEXIST && shared) {
2421            struct btrfs_leaf_ref *old;
2422            old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2423            BUG_ON(!old);
2424            btrfs_remove_leaf_ref(root, old);
2425            btrfs_free_leaf_ref(root, old);
2426            ret = btrfs_add_leaf_ref(root, ref, shared);
2427        }
2428        WARN_ON(ret);
2429        btrfs_free_leaf_ref(root, ref);
2430    }
2431out:
2432    return ret;
2433}
2434
2435/* when a block goes through cow, we update the reference counts of
2436 * everything that block points to. The internal pointers of the block
2437 * can be in just about any order, and it is likely to have clusters of
2438 * things that are close together and clusters of things that are not.
2439 *
2440 * To help reduce the seeks that come with updating all of these reference
2441 * counts, sort them by byte number before actual updates are done.
2442 *
2443 * struct refsort is used to match byte number to slot in the btree block.
2444 * we sort based on the byte number and then use the slot to actually
2445 * find the item.
2446 *
2447 * struct refsort is smaller than struct btrfs_item and smaller than
2448 * struct btrfs_key_ptr. Since we're currently limited to the page size
2449 * for a btree block, there's no way for a kmalloc of refsorts for a
2450 * single node to be bigger than a page.
2451 */
2452struct refsort {
2453    u64 bytenr;
2454    u32 slot;
2455};
2456
2457/*
2458 * for passing into sort()
2459 */
2460static int refsort_cmp(const void *a_void, const void *b_void)
2461{
2462    const struct refsort *a = a_void;
2463    const struct refsort *b = b_void;
2464
2465    if (a->bytenr < b->bytenr)
2466        return -1;
2467    if (a->bytenr > b->bytenr)
2468        return 1;
2469    return 0;
2470}
2471#endif
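
/*
 * Editorial sketch (not part of the original source): the refsort scheme
 * described in the #if 0 block above would be used by filling an array of
 * (bytenr, slot) pairs and handing refsort_cmp to sort() from linux/sort.h,
 * so the reference updates can be issued in ascending bytenr order:
 */
#if 0    /* illustration only; buf is a hypothetical extent_buffer */
    struct refsort *sorted;
    u32 nr = btrfs_header_nritems(buf);

    sorted = kmalloc(nr * sizeof(*sorted), GFP_NOFS);
    /* (error handling omitted) fill sorted[i].bytenr / sorted[i].slot */
    sort(sorted, nr, sizeof(struct refsort), refsort_cmp, NULL);
    /* walk sorted[] and update each ref through sorted[i].slot */
    kfree(sorted);
#endif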
2472
2473static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2474               struct btrfs_root *root,
2475               struct extent_buffer *buf,
2476               int full_backref, int inc)
2477{
2478    u64 bytenr;
2479    u64 num_bytes;
2480    u64 parent;
2481    u64 ref_root;
2482    u32 nritems;
2483    struct btrfs_key key;
2484    struct btrfs_file_extent_item *fi;
2485    int i;
2486    int level;
2487    int ret = 0;
2488    int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2489                u64, u64, u64, u64, u64, u64);
2490
2491    ref_root = btrfs_header_owner(buf);
2492    nritems = btrfs_header_nritems(buf);
2493    level = btrfs_header_level(buf);
2494
2495    if (!root->ref_cows && level == 0)
2496        return 0;
2497
2498    if (inc)
2499        process_func = btrfs_inc_extent_ref;
2500    else
2501        process_func = btrfs_free_extent;
2502
2503    if (full_backref)
2504        parent = buf->start;
2505    else
2506        parent = 0;
2507
2508    for (i = 0; i < nritems; i++) {
2509        if (level == 0) {
2510            btrfs_item_key_to_cpu(buf, &key, i);
2511            if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2512                continue;
2513            fi = btrfs_item_ptr(buf, i,
2514                        struct btrfs_file_extent_item);
2515            if (btrfs_file_extent_type(buf, fi) ==
2516                BTRFS_FILE_EXTENT_INLINE)
2517                continue;
2518            bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2519            if (bytenr == 0)
2520                continue;
2521
2522            num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2523            key.offset -= btrfs_file_extent_offset(buf, fi);
2524            ret = process_func(trans, root, bytenr, num_bytes,
2525                       parent, ref_root, key.objectid,
2526                       key.offset);
2527            if (ret)
2528                goto fail;
2529        } else {
2530            bytenr = btrfs_node_blockptr(buf, i);
2531            num_bytes = btrfs_level_size(root, level - 1);
2532            ret = process_func(trans, root, bytenr, num_bytes,
2533                       parent, ref_root, level - 1, 0);
2534            if (ret)
2535                goto fail;
2536        }
2537    }
2538    return 0;
2539fail:
2540    BUG();
2541    return ret;
2542}
2543
2544int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2545          struct extent_buffer *buf, int full_backref)
2546{
2547    return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2548}
2549
2550int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2551          struct extent_buffer *buf, int full_backref)
2552{
2553    return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2554}
2555
2556static int write_one_cache_group(struct btrfs_trans_handle *trans,
2557                 struct btrfs_root *root,
2558                 struct btrfs_path *path,
2559                 struct btrfs_block_group_cache *cache)
2560{
2561    int ret;
2562    struct btrfs_root *extent_root = root->fs_info->extent_root;
2563    unsigned long bi;
2564    struct extent_buffer *leaf;
2565
2566    ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2567    if (ret < 0)
2568        goto fail;
2569    BUG_ON(ret);
2570
2571    leaf = path->nodes[0];
2572    bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2573    write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2574    btrfs_mark_buffer_dirty(leaf);
2575    btrfs_release_path(extent_root, path);
2576fail:
2577    if (ret)
2578        return ret;
2579    return 0;
2580
2581}
2582
2583static struct btrfs_block_group_cache *
2584next_block_group(struct btrfs_root *root,
2585         struct btrfs_block_group_cache *cache)
2586{
2587    struct rb_node *node;
2588    spin_lock(&root->fs_info->block_group_cache_lock);
2589    node = rb_next(&cache->cache_node);
2590    btrfs_put_block_group(cache);
2591    if (node) {
2592        cache = rb_entry(node, struct btrfs_block_group_cache,
2593                 cache_node);
2594        btrfs_get_block_group(cache);
2595    } else
2596        cache = NULL;
2597    spin_unlock(&root->fs_info->block_group_cache_lock);
2598    return cache;
2599}
2600
2601int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2602                   struct btrfs_root *root)
2603{
2604    struct btrfs_block_group_cache *cache;
2605    int err = 0;
2606    struct btrfs_path *path;
2607    u64 last = 0;
2608
2609    path = btrfs_alloc_path();
2610    if (!path)
2611        return -ENOMEM;
2612
2613    while (1) {
2614        if (last == 0) {
2615            err = btrfs_run_delayed_refs(trans, root,
2616                             (unsigned long)-1);
2617            BUG_ON(err);
2618        }
2619
2620        cache = btrfs_lookup_first_block_group(root->fs_info, last);
2621        while (cache) {
2622            if (cache->dirty)
2623                break;
2624            cache = next_block_group(root, cache);
2625        }
2626        if (!cache) {
2627            if (last == 0)
2628                break;
2629            last = 0;
2630            continue;
2631        }
2632
2633        cache->dirty = 0;
2634        last = cache->key.objectid + cache->key.offset;
2635
2636        err = write_one_cache_group(trans, root, path, cache);
2637        BUG_ON(err);
2638        btrfs_put_block_group(cache);
2639    }
2640
2641    btrfs_free_path(path);
2642    return 0;
2643}
2644
2645int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2646{
2647    struct btrfs_block_group_cache *block_group;
2648    int readonly = 0;
2649
2650    block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2651    if (!block_group || block_group->ro)
2652        readonly = 1;
2653    if (block_group)
2654        btrfs_put_block_group(block_group);
2655    return readonly;
2656}
2657
2658static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2659                 u64 total_bytes, u64 bytes_used,
2660                 struct btrfs_space_info **space_info)
2661{
2662    struct btrfs_space_info *found;
2663
2664    found = __find_space_info(info, flags);
2665    if (found) {
2666        spin_lock(&found->lock);
2667        found->total_bytes += total_bytes;
2668        found->bytes_used += bytes_used;
2669        found->full = 0;
2670        spin_unlock(&found->lock);
2671        *space_info = found;
2672        return 0;
2673    }
2674    found = kzalloc(sizeof(*found), GFP_NOFS);
2675    if (!found)
2676        return -ENOMEM;
2677
2678    INIT_LIST_HEAD(&found->block_groups);
2679    init_rwsem(&found->groups_sem);
2680    init_waitqueue_head(&found->flush_wait);
2681    init_waitqueue_head(&found->allocate_wait);
2682    spin_lock_init(&found->lock);
2683    found->flags = flags;
2684    found->total_bytes = total_bytes;
2685    found->bytes_used = bytes_used;
2686    found->bytes_pinned = 0;
2687    found->bytes_reserved = 0;
2688    found->bytes_readonly = 0;
2689    found->bytes_delalloc = 0;
2690    found->full = 0;
2691    found->force_alloc = 0;
2692    *space_info = found;
2693    list_add_rcu(&found->list, &info->space_info);
2694    atomic_set(&found->caching_threads, 0);
2695    return 0;
2696}
2697
2698static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2699{
2700    u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
2701                   BTRFS_BLOCK_GROUP_RAID1 |
2702                   BTRFS_BLOCK_GROUP_RAID10 |
2703                   BTRFS_BLOCK_GROUP_DUP);
2704    if (extra_flags) {
2705        if (flags & BTRFS_BLOCK_GROUP_DATA)
2706            fs_info->avail_data_alloc_bits |= extra_flags;
2707        if (flags & BTRFS_BLOCK_GROUP_METADATA)
2708            fs_info->avail_metadata_alloc_bits |= extra_flags;
2709        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2710            fs_info->avail_system_alloc_bits |= extra_flags;
2711    }
2712}
2713
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716    spin_lock(&cache->space_info->lock);
2717    spin_lock(&cache->lock);
2718    if (!cache->ro) {
2719        cache->space_info->bytes_readonly += cache->key.offset -
2720                    btrfs_block_group_used(&cache->item);
2721        cache->ro = 1;
2722    }
2723    spin_unlock(&cache->lock);
2724    spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{
2729    u64 num_devices = root->fs_info->fs_devices->rw_devices;
2730
2731    if (num_devices == 1)
2732        flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
2733    if (num_devices < 4)
2734        flags &= ~BTRFS_BLOCK_GROUP_RAID10;
2735
2736    if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
2737        (flags & (BTRFS_BLOCK_GROUP_RAID1 |
2738              BTRFS_BLOCK_GROUP_RAID10))) {
2739        flags &= ~BTRFS_BLOCK_GROUP_DUP;
2740    }
2741
2742    if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
2743        (flags & BTRFS_BLOCK_GROUP_RAID10)) {
2744        flags &= ~BTRFS_BLOCK_GROUP_RAID1;
2745    }
2746
2747    if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
2748        ((flags & BTRFS_BLOCK_GROUP_RAID1) |
2749         (flags & BTRFS_BLOCK_GROUP_RAID10) |
2750         (flags & BTRFS_BLOCK_GROUP_DUP)))
2751        flags &= ~BTRFS_BLOCK_GROUP_RAID0;
2752    return flags;
2753}
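
/*
 * Editorial worked example (not part of the original source): with three
 * rw devices and flags = DATA | RAID10 | RAID1 | DUP, the reductions above
 * run as follows: RAID10 is cleared because fewer than four devices are
 * available, DUP is cleared because RAID1 is still set, RAID1 survives
 * because RAID10 is already gone, and the result is DATA | RAID1.
 */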
2754
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2756{
2757    struct btrfs_fs_info *info = root->fs_info;
2758    u64 alloc_profile;
2759
2760    if (data) {
2761        alloc_profile = info->avail_data_alloc_bits &
2762            info->data_alloc_profile;
2763        data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764    } else if (root == root->fs_info->chunk_root) {
2765        alloc_profile = info->avail_system_alloc_bits &
2766            info->system_alloc_profile;
2767        data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768    } else {
2769        alloc_profile = info->avail_metadata_alloc_bits &
2770            info->metadata_alloc_profile;
2771        data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772    }
2773
2774    return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779    u64 alloc_target;
2780
2781    alloc_target = btrfs_get_alloc_profile(root, 1);
2782    BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783                               alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788    u64 num_bytes;
2789    int level;
2790
2791    level = BTRFS_MAX_LEVEL - 2;
2792    /*
2793     * NOTE: these calculations are absolutely the worst possible case.
2794     * This assumes that _every_ item we insert will require a new leaf, and
2795     * that the tree has grown to its maximum level size.
2796     */
2797
2798    /*
2799     * for every item we insert we could insert both an extent item and an
2800     * extent ref item. Then for every item we insert, we will need to cow
2801     * the original leaf, plus the leaves to the left and right of it.
2802     *
2803     * Unless we are talking about the extent root, then we just want the
2804     * number of items * 2, since we just need the extent item plus its ref.
2805     */
2806    if (root == root->fs_info->extent_root)
2807        num_bytes = num_items * 2;
2808    else
2809        num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811    /*
2812     * num_bytes is the total number of leaves we could need times the leaf
2813     * size, and then for every leaf we could end up cow'ing 2 nodes per
2814     * level, down to the leaf level.
2815     */
2816    num_bytes = (num_bytes * root->leafsize) +
2817        (num_bytes * (level * 2)) * root->nodesize;
2818
2819    return num_bytes;
2820}
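
/*
 * Editorial worked example (not part of the original source), assuming
 * BTRFS_MAX_LEVEL == 8 so level == 6 here: for one item on a non-extent
 * root, num_bytes starts as (1 + 2 * 1) * 3 = 9 "leaves", and the final
 * figure is 9 * leafsize + 9 * (6 * 2) * nodesize.  With 4K leaves and
 * nodes that is 36864 + 442368 = 479232 bytes (about 468K) reserved for a
 * single item, which is why the comments above stress the worst case.
 */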
2821
2822/*
2823 * Unreserve metadata space for delalloc. If we have fewer reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827                      struct inode *inode, int num_items)
2828{
2829    struct btrfs_fs_info *info = root->fs_info;
2830    struct btrfs_space_info *meta_sinfo;
2831    u64 num_bytes;
2832    u64 alloc_target;
2833    bool bug = false;
2834
2835    /* get the space info for where the metadata will live */
2836    alloc_target = btrfs_get_alloc_profile(root, 0);
2837    meta_sinfo = __find_space_info(info, alloc_target);
2838
2839    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840                       num_items);
2841
2842    spin_lock(&meta_sinfo->lock);
2843    spin_lock(&BTRFS_I(inode)->accounting_lock);
2844    if (BTRFS_I(inode)->reserved_extents <=
2845        BTRFS_I(inode)->outstanding_extents) {
2846        spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847        spin_unlock(&meta_sinfo->lock);
2848        return 0;
2849    }
2850    spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852    BTRFS_I(inode)->reserved_extents -= num_items;
2853    BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855    if (meta_sinfo->bytes_delalloc < num_bytes) {
2856        bug = true;
2857        meta_sinfo->bytes_delalloc = 0;
2858    } else {
2859        meta_sinfo->bytes_delalloc -= num_bytes;
2860    }
2861    spin_unlock(&meta_sinfo->lock);
2862
2863    BUG_ON(bug);
2864
2865    return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{
2870    u64 thresh;
2871
2872    thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2873        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2874        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2875        meta_sinfo->bytes_may_use;
2876
2877    thresh = meta_sinfo->total_bytes - thresh;
2878    thresh *= 80;
2879    do_div(thresh, 100);
2880    if (thresh <= meta_sinfo->bytes_delalloc)
2881        meta_sinfo->force_delalloc = 1;
2882    else
2883        meta_sinfo->force_delalloc = 0;
2884}
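
/*
 * Editorial worked example (not part of the original source): the math
 * above takes whatever is left after the non-delalloc consumers (used,
 * reserved, pinned, readonly, super, root, may_use) and turns on
 * force_delalloc once delalloc reservations reach 80% of that remainder.
 * For instance, with total_bytes of 8G and 3G consumed by the other
 * counters, the threshold is (8G - 3G) * 80 / 100 = 4G, so any
 * bytes_delalloc of 4G or more sets force_delalloc.
 */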
2885
2886struct async_flush {
2887    struct btrfs_root *root;
2888    struct btrfs_space_info *info;
2889    struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{
2894    struct async_flush *async;
2895    struct btrfs_root *root;
2896    struct btrfs_space_info *info;
2897
2898    async = container_of(work, struct async_flush, work);
2899    root = async->root;
2900    info = async->info;
2901
2902    btrfs_start_delalloc_inodes(root, 0);
2903    wake_up(&info->flush_wait);
2904    btrfs_wait_ordered_extents(root, 0, 0);
2905
2906    spin_lock(&info->lock);
2907    info->flushing = 0;
2908    spin_unlock(&info->lock);
2909    wake_up(&info->flush_wait);
2910
2911    kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916    DEFINE_WAIT(wait);
2917    u64 used;
2918
2919    while (1) {
2920        prepare_to_wait(&info->flush_wait, &wait,
2921                TASK_UNINTERRUPTIBLE);
2922        spin_lock(&info->lock);
2923        if (!info->flushing) {
2924            spin_unlock(&info->lock);
2925            break;
2926        }
2927
2928        used = info->bytes_used + info->bytes_reserved +
2929            info->bytes_pinned + info->bytes_readonly +
2930            info->bytes_super + info->bytes_root +
2931            info->bytes_may_use + info->bytes_delalloc;
2932        if (used < info->total_bytes) {
2933            spin_unlock(&info->lock);
2934            break;
2935        }
2936        spin_unlock(&info->lock);
2937        schedule();
2938    }
2939    finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943                 struct btrfs_space_info *info)
2944{
2945    struct async_flush *async;
2946    bool wait = false;
2947
2948    spin_lock(&info->lock);
2949
2950    if (!info->flushing)
2951        info->flushing = 1;
2952    else
2953        wait = true;
2954
2955    spin_unlock(&info->lock);
2956
2957    if (wait) {
2958        wait_on_flush(info);
2959        return;
2960    }
2961
2962    async = kzalloc(sizeof(*async), GFP_NOFS);
2963    if (!async)
2964        goto flush;
2965
2966    async->root = root;
2967    async->info = info;
2968    async->work.func = flush_delalloc_async;
2969
2970    btrfs_queue_worker(&root->fs_info->enospc_workers,
2971               &async->work);
2972    wait_on_flush(info);
2973    return;
2974
2975flush:
2976    btrfs_start_delalloc_inodes(root, 0);
2977    btrfs_wait_ordered_extents(root, 0, 0);
2978
2979    spin_lock(&info->lock);
2980    info->flushing = 0;
2981    spin_unlock(&info->lock);
2982    wake_up(&info->flush_wait);
2983}
2984
2985static int maybe_allocate_chunk(struct btrfs_root *root,
2986                 struct btrfs_space_info *info)
2987{
2988    struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989    struct btrfs_trans_handle *trans;
2990    bool wait = false;
2991    int ret = 0;
2992    u64 min_metadata;
2993    u64 free_space;
2994
2995    free_space = btrfs_super_total_bytes(disk_super);
2996    /*
2997     * we allow the metadata to grow to a max of 10GB or 5% of the
2998     * space in the volume, whichever is smaller.
2999     */
3000    min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001                 div64_u64(free_space * 5, 100));
3002    if (info->total_bytes >= min_metadata) {
3003        spin_unlock(&info->lock);
3004        return 0;
3005    }
3006
3007    if (info->full) {
3008        spin_unlock(&info->lock);
3009        return 0;
3010    }
3011
3012    if (!info->allocating_chunk) {
3013        info->force_alloc = 1;
3014        info->allocating_chunk = 1;
3015    } else {
3016        wait = true;
3017    }
3018
3019    spin_unlock(&info->lock);
3020
3021    if (wait) {
3022        wait_event(info->allocate_wait,
3023               !info->allocating_chunk);
3024        return 1;
3025    }
3026
3027    trans = btrfs_start_transaction(root, 1);
3028    if (!trans) {
3029        ret = -ENOMEM;
3030        goto out;
3031    }
3032
3033    ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034                 4096 + 2 * 1024 * 1024,
3035                 info->flags, 0);
3036    btrfs_end_transaction(trans, root);
3037    if (ret)
3038        goto out;
3039out:
3040    spin_lock(&info->lock);
3041    info->allocating_chunk = 0;
3042    spin_unlock(&info->lock);
3043    wake_up(&info->allocate_wait);
3044
3045    if (ret)
3046        return 0;
3047    return 1;
3048}
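
/*
 * Editorial worked example (not part of the original source): min_metadata
 * above is the smaller of 10G and 5% of the filesystem size.  On a 100G
 * filesystem 5% is 5G, so chunk allocation here is skipped once the
 * metadata space_info reaches 5G; on a 1T filesystem 5% would be about
 * 51G, so the 10G cap applies instead.
 */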
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054                    struct inode *inode, int num_items)
3055{
3056    struct btrfs_fs_info *info = root->fs_info;
3057    struct btrfs_space_info *meta_sinfo;
3058    u64 num_bytes;
3059    u64 used;
3060    u64 alloc_target;
3061    int flushed = 0;
3062    int force_delalloc;
3063
3064    /* get the space info for where the metadata will live */
3065    alloc_target = btrfs_get_alloc_profile(root, 0);
3066    meta_sinfo = __find_space_info(info, alloc_target);
3067
3068    num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069                       num_items);
3070again:
3071    spin_lock(&meta_sinfo->lock);
3072
3073    force_delalloc = meta_sinfo->force_delalloc;
3074
3075    if (unlikely(!meta_sinfo->bytes_root))
3076        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078    if (!flushed)
3079        meta_sinfo->bytes_delalloc += num_bytes;
3080
3081    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086    if (used > meta_sinfo->total_bytes) {
3087        flushed++;
3088
3089        if (flushed == 1) {
3090            if (maybe_allocate_chunk(root, meta_sinfo))
3091                goto again;
3092            flushed++;
3093        } else {
3094            spin_unlock(&meta_sinfo->lock);
3095        }
3096
3097        if (flushed == 2) {
3098            filemap_flush(inode->i_mapping);
3099            goto again;
3100        } else if (flushed == 3) {
3101            flush_delalloc(root, meta_sinfo);
3102            goto again;
3103        }
3104        spin_lock(&meta_sinfo->lock);
3105        meta_sinfo->bytes_delalloc -= num_bytes;
3106        spin_unlock(&meta_sinfo->lock);
3107        printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108               BTRFS_I(inode)->outstanding_extents,
3109               BTRFS_I(inode)->reserved_extents);
3110        dump_space_info(meta_sinfo, 0, 0);
3111        return -ENOSPC;
3112    }
3113
3114    BTRFS_I(inode)->reserved_extents += num_items;
3115    check_force_delalloc(meta_sinfo);
3116    spin_unlock(&meta_sinfo->lock);
3117
3118    if (!flushed && force_delalloc)
3119        filemap_flush(inode->i_mapping);
3120
3121    return 0;
3122}
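
/*
 * Editorial sketch (not part of the original source): the two delalloc
 * helpers are meant to be used as a pair; the reserve side bumps
 * reserved_extents and bytes_delalloc, and the unreserve side drops them
 * again once the outstanding extents have been accounted for.  A
 * hypothetical caller pattern:
 */
#if 0    /* illustration only */
    ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
    if (ret)
        return ret;
    /* ... dirty the pages / add the delalloc extent ... */
    btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
#endif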
3123
3124/*
3125 * unreserve num_items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * operations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135    struct btrfs_fs_info *info = root->fs_info;
3136    struct btrfs_space_info *meta_sinfo;
3137    u64 num_bytes;
3138    u64 alloc_target;
3139    bool bug = false;
3140
3141    /* get the space info for where the metadata will live */
3142    alloc_target = btrfs_get_alloc_profile(root, 0);
3143    meta_sinfo = __find_space_info(info, alloc_target);
3144
3145    num_bytes = calculate_bytes_needed(root, num_items);
3146
3147    spin_lock(&meta_sinfo->lock);
3148    if (meta_sinfo->bytes_may_use < num_bytes) {
3149        bug = true;
3150        meta_sinfo->bytes_may_use = 0;
3151    } else {
3152        meta_sinfo->bytes_may_use -= num_bytes;
3153    }
3154    spin_unlock(&meta_sinfo->lock);
3155
3156    BUG_ON(bug);
3157
3158    return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worst case number
3163 * of bytes that would be needed to modify num_items items. If we
3164 * have space, fantastic; if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. The only time we don't do this is if we're reserving space
3171 * inside of a transaction; then we will just return -ENOSPC and it is the
3172 * caller's responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{
3176    struct btrfs_fs_info *info = root->fs_info;
3177    struct btrfs_space_info *meta_sinfo;
3178    u64 num_bytes;
3179    u64 used;
3180    u64 alloc_target;
3181    int retries = 0;
3182
3183    /* get the space info for where the metadata will live */
3184    alloc_target = btrfs_get_alloc_profile(root, 0);
3185    meta_sinfo = __find_space_info(info, alloc_target);
3186
3187    num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189    spin_lock(&meta_sinfo->lock);
3190
3191    if (unlikely(!meta_sinfo->bytes_root))
3192        meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194    if (!retries)
3195        meta_sinfo->bytes_may_use += num_bytes;
3196
3197    used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198        meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199        meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200        meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202    if (used > meta_sinfo->total_bytes) {
3203        retries++;
3204        if (retries == 1) {
3205            if (maybe_allocate_chunk(root, meta_sinfo))
3206                goto again;
3207            retries++;
3208        } else {
3209            spin_unlock(&meta_sinfo->lock);
3210        }
3211
3212        if (retries == 2) {
3213            flush_delalloc(root, meta_sinfo);
3214            goto again;
3215        }
3216        spin_lock(&meta_sinfo->lock);
3217        meta_sinfo->bytes_may_use -= num_bytes;
3218        spin_unlock(&meta_sinfo->lock);
3219
3220        dump_space_info(meta_sinfo, 0, 0);
3221        return -ENOSPC;
3222    }
3223
3224    check_force_delalloc(meta_sinfo);
3225    spin_unlock(&meta_sinfo->lock);
3226
3227    return 0;
3228}
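
/*
 * Editorial sketch (not part of the original source): per the comments
 * above, btrfs_reserve_metadata_space() and btrfs_unreserve_metadata_space()
 * are paired around a tree modification with the same item count, with the
 * unreserve ideally run after btrfs_end_transaction().  A hypothetical
 * caller:
 */
#if 0    /* illustration only; trans is a hypothetical handle */
    ret = btrfs_reserve_metadata_space(root, 3);
    if (ret)
        return ret;

    trans = btrfs_start_transaction(root, 1);
    /* ... modify up to 3 items ... */
    btrfs_end_transaction(trans, root);

    btrfs_unreserve_metadata_space(root, 3);
#endif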
3229
3230/*
3231 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes.
3233 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3235                u64 bytes)
3236{
3237    struct btrfs_space_info *data_sinfo;
3238    u64 used;
3239    int ret = 0, committed = 0, flushed = 0;
3240
3241    /* make sure bytes are sectorsize aligned */
3242    bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3243
3244    data_sinfo = BTRFS_I(inode)->space_info;
3245    if (!data_sinfo)
3246        goto alloc;
3247
3248again:
3249    /* make sure we have enough space to handle the data first */
3250    spin_lock(&data_sinfo->lock);
3251    used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
3252        data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
3253        data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
3254        data_sinfo->bytes_super;
3255
3256    if (used + bytes > data_sinfo->total_bytes) {
3257        struct btrfs_trans_handle *trans;
3258
3259        if (!flushed) {
3260            spin_unlock(&data_sinfo->lock);
3261            flush_delalloc(root, data_sinfo);
3262            flushed = 1;
3263            goto again;
3264        }
3265
3266        /*
3267         * if we don't have enough free bytes in this space then we need
3268         * to alloc a new chunk.
3269         */
3270        if (!data_sinfo->full) {
3271            u64 alloc_target;
3272
3273            data_sinfo->force_alloc = 1;
3274            spin_unlock(&data_sinfo->lock);
3275alloc:
3276            alloc_target = btrfs_get_alloc_profile(root, 1);
3277            trans = btrfs_start_transaction(root, 1);
3278            if (!trans)
3279                return -ENOMEM;
3280
3281            ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282                         bytes + 2 * 1024 * 1024,
3283                         alloc_target, 0);
3284            btrfs_end_transaction(trans, root);
3285            if (ret)
3286                return ret;
3287
3288            if (!data_sinfo) {
3289                btrfs_set_inode_space_info(root, inode);
3290                data_sinfo = BTRFS_I(inode)->space_info;
3291            }
3292            goto again;
3293        }
3294        spin_unlock(&data_sinfo->lock);
3295
3296        /* commit the current transaction and try again */
3297        if (!committed && !root->fs_info->open_ioctl_trans) {
3298            committed = 1;
3299            trans = btrfs_join_transaction(root, 1);
3300            if (!trans)
3301                return -ENOMEM;
3302            ret = btrfs_commit_transaction(trans, root);
3303            if (ret)
3304                return ret;
3305            goto again;
3306        }
3307
3308        printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
3309               ", %llu bytes_used, %llu bytes_reserved, "
3310               "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
3311               "%llu total\n", (unsigned long long)bytes,
3312               (unsigned long long)data_sinfo->bytes_delalloc,
3313               (unsigned long long)data_sinfo->bytes_used,
3314               (unsigned long long)data_sinfo->bytes_reserved,
3315               (unsigned long long)data_sinfo->bytes_pinned,
3316               (unsigned long long)data_sinfo->bytes_readonly,
3317               (unsigned long long)data_sinfo->bytes_may_use,
3318               (unsigned long long)data_sinfo->total_bytes);
3319        return -ENOSPC;
3320    }
3321    data_sinfo->bytes_may_use += bytes;
3322    BTRFS_I(inode)->reserved_bytes += bytes;
3323    spin_unlock(&data_sinfo->lock);
3324
3325    return 0;
3326}
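
/*
 * A minimal usage sketch (illustrative only; the real callers live elsewhere):
 *
 *        ret = btrfs_check_data_free_space(root, inode, num_bytes);
 *        if (ret)
 *                return ret;
 *        ret = do_the_write(inode, num_bytes);   (hypothetical helper)
 *        if (ret)
 *                btrfs_free_reserved_data_space(root, inode, num_bytes);
 *
 * any error after a successful check should be unwound with
 * btrfs_free_reserved_data_space() below so the space_info counters and
 * the inode's reserved_bytes stay balanced.
 */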
3327
3328/*
3329 * if there was an error for whatever reason after calling
3330 * btrfs_check_data_free_space, call this so we can clean up the counters.
3331 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root,
3333                    struct inode *inode, u64 bytes)
3334{
3335    struct btrfs_space_info *data_sinfo;
3336
3337    /* make sure bytes are sectorsize aligned */
3338    bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3339
3340    data_sinfo = BTRFS_I(inode)->space_info;
3341    spin_lock(&data_sinfo->lock);
3342    data_sinfo->bytes_may_use -= bytes;
3343    BTRFS_I(inode)->reserved_bytes -= bytes;
3344    spin_unlock(&data_sinfo->lock);
3345}
3346
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349                  u64 bytes)
3350{
3351    struct btrfs_space_info *data_sinfo;
3352
3353    /* get the space info for where this inode will be storing its data */
3354    data_sinfo = BTRFS_I(inode)->space_info;
3355
3356    /* make sure we have enough space to handle the data first */
3357    spin_lock(&data_sinfo->lock);
3358    data_sinfo->bytes_delalloc += bytes;
3359
3360    /*
3361     * we are adding a delalloc extent without calling
3362     * btrfs_check_data_free_space first. This happens on a weird
3363     * writepage condition, but shouldn't hurt our accounting
3364     */
3365    if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366        data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367        BTRFS_I(inode)->reserved_bytes = 0;
3368    } else {
3369        data_sinfo->bytes_may_use -= bytes;
3370        BTRFS_I(inode)->reserved_bytes -= bytes;
3371    }
3372
3373    spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378                  u64 bytes)
3379{
3380    struct btrfs_space_info *info;
3381
3382    info = BTRFS_I(inode)->space_info;
3383
3384    spin_lock(&info->lock);
3385    info->bytes_delalloc -= bytes;
3386    spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{
3391    struct list_head *head = &info->space_info;
3392    struct btrfs_space_info *found;
3393
3394    rcu_read_lock();
3395    list_for_each_entry_rcu(found, head, list) {
3396        if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3397            found->force_alloc = 1;
3398    }
3399    rcu_read_unlock();
3400}
3401
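/*
 * allocate a new chunk of the given type if the matching space_info is
 * getting close to full (or force/force_alloc is set).  The threshold is
 * roughly 80% of the writable bytes (div_factor(thresh, 8)).  When a data
 * chunk is allocated and metadata_ratio is set, periodically force a
 * metadata chunk allocation too so the FS keeps a reasonable metadata
 * reserve.
 */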
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403              struct btrfs_root *extent_root, u64 alloc_bytes,
3404              u64 flags, int force)
3405{
3406    struct btrfs_space_info *space_info;
3407    struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408    u64 thresh;
3409    int ret = 0;
3410
3411    mutex_lock(&fs_info->chunk_mutex);
3412
3413    flags = btrfs_reduce_alloc_profile(extent_root, flags);
3414
3415    space_info = __find_space_info(extent_root->fs_info, flags);
3416    if (!space_info) {
3417        ret = update_space_info(extent_root->fs_info, flags,
3418                    0, 0, &space_info);
3419        BUG_ON(ret);
3420    }
3421    BUG_ON(!space_info);
3422
3423    spin_lock(&space_info->lock);
3424    if (space_info->force_alloc)
3425        force = 1;
3426    if (space_info->full) {
3427        spin_unlock(&space_info->lock);
3428        goto out;
3429    }
3430
3431    thresh = space_info->total_bytes - space_info->bytes_readonly;
3432    thresh = div_factor(thresh, 8);
3433    if (!force &&
3434       (space_info->bytes_used + space_info->bytes_pinned +
3435        space_info->bytes_reserved + alloc_bytes) < thresh) {
3436        spin_unlock(&space_info->lock);
3437        goto out;
3438    }
3439    spin_unlock(&space_info->lock);
3440
3441    /*
3442     * if we're doing a data chunk, go ahead and make sure that
3443     * we keep a reasonable number of metadata chunks allocated in the
3444     * FS as well.
3445     */
3446    if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3447        fs_info->data_chunk_allocations++;
3448        if (!(fs_info->data_chunk_allocations %
3449              fs_info->metadata_ratio))
3450            force_metadata_allocation(fs_info);
3451    }
3452
3453    ret = btrfs_alloc_chunk(trans, extent_root, flags);
3454    spin_lock(&space_info->lock);
3455    if (ret)
3456        space_info->full = 1;
3457    space_info->force_alloc = 0;
3458    spin_unlock(&space_info->lock);
3459out:
3460    mutex_unlock(&extent_root->fs_info->chunk_mutex);
3461    return ret;
3462}
3463
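/*
 * update the used-bytes accounting for every block group that overlaps
 * [bytenr, bytenr + num_bytes): bump or drop the super block and block
 * group item counters, move bytes between reserved and used on allocation,
 * and on free optionally discard the range and hand it back to the free
 * space cache when mark_free is set.
 */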
3464static int update_block_group(struct btrfs_trans_handle *trans,
3465                  struct btrfs_root *root,
3466                  u64 bytenr, u64 num_bytes, int alloc,
3467                  int mark_free)
3468{
3469    struct btrfs_block_group_cache *cache;
3470    struct btrfs_fs_info *info = root->fs_info;
3471    u64 total = num_bytes;
3472    u64 old_val;
3473    u64 byte_in_group;
3474
3475    /* block accounting for super block */
3476    spin_lock(&info->delalloc_lock);
3477    old_val = btrfs_super_bytes_used(&info->super_copy);
3478    if (alloc)
3479        old_val += num_bytes;
3480    else
3481        old_val -= num_bytes;
3482    btrfs_set_super_bytes_used(&info->super_copy, old_val);
3483    spin_unlock(&info->delalloc_lock);
3484
3485    while (total) {
3486        cache = btrfs_lookup_block_group(info, bytenr);
3487        if (!cache)
3488            return -1;
3489        byte_in_group = bytenr - cache->key.objectid;
3490        WARN_ON(byte_in_group > cache->key.offset);
3491
3492        spin_lock(&cache->space_info->lock);
3493        spin_lock(&cache->lock);
3494        cache->dirty = 1;
3495        old_val = btrfs_block_group_used(&cache->item);
3496        num_bytes = min(total, cache->key.offset - byte_in_group);
3497        if (alloc) {
3498            old_val += num_bytes;
3499            btrfs_set_block_group_used(&cache->item, old_val);
3500            cache->reserved -= num_bytes;
3501            cache->space_info->bytes_used += num_bytes;
3502            cache->space_info->bytes_reserved -= num_bytes;
3503            if (cache->ro)
3504                cache->space_info->bytes_readonly -= num_bytes;
3505            spin_unlock(&cache->lock);
3506            spin_unlock(&cache->space_info->lock);
3507        } else {
3508            old_val -= num_bytes;
3509            cache->space_info->bytes_used -= num_bytes;
3510            if (cache->ro)
3511                cache->space_info->bytes_readonly += num_bytes;
3512            btrfs_set_block_group_used(&cache->item, old_val);
3513            spin_unlock(&cache->lock);
3514            spin_unlock(&cache->space_info->lock);
3515            if (mark_free) {
3516                int ret;
3517
3518                ret = btrfs_discard_extent(root, bytenr,
3519                               num_bytes);
3520                WARN_ON(ret);
3521
3522                ret = btrfs_add_free_space(cache, bytenr,
3523                               num_bytes);
3524                WARN_ON(ret);
3525            }
3526        }
3527        btrfs_put_block_group(cache);
3528        total -= num_bytes;
3529        bytenr += num_bytes;
3530    }
3531    return 0;
3532}
3533
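/*
 * return the logical start of the first block group at or after
 * search_start, or 0 if there are no block groups there.
 */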
3534static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3535{
3536    struct btrfs_block_group_cache *cache;
3537    u64 bytenr;
3538
3539    cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
3540    if (!cache)
3541        return 0;
3542
3543    bytenr = cache->key.objectid;
3544    btrfs_put_block_group(cache);
3545
3546    return bytenr;
3547}
3548
3549/*
3550 * this function must be called within a transaction
3551 */
3552int btrfs_pin_extent(struct btrfs_root *root,
3553             u64 bytenr, u64 num_bytes, int reserved)
3554{
3555    struct btrfs_fs_info *fs_info = root->fs_info;
3556    struct btrfs_block_group_cache *cache;
3557
3558    cache = btrfs_lookup_block_group(fs_info, bytenr);
3559    BUG_ON(!cache);
3560
3561    spin_lock(&cache->space_info->lock);
3562    spin_lock(&cache->lock);
3563    cache->pinned += num_bytes;
3564    cache->space_info->bytes_pinned += num_bytes;
3565    if (reserved) {
3566        cache->reserved -= num_bytes;
3567        cache->space_info->bytes_reserved -= num_bytes;
3568    }
3569    spin_unlock(&cache->lock);
3570    spin_unlock(&cache->space_info->lock);
3571
3572    btrfs_put_block_group(cache);
3573
3574    set_extent_dirty(fs_info->pinned_extents,
3575             bytenr, bytenr + num_bytes - 1, GFP_NOFS);
3576    return 0;
3577}
3578
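/*
 * adjust the reservation counters for a block group: reserve != 0 adds
 * num_bytes to cache->reserved and the space_info's bytes_reserved,
 * reserve == 0 drops them again.
 */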
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache,
3580                   u64 num_bytes, int reserve)
3581{
3582    spin_lock(&cache->space_info->lock);
3583    spin_lock(&cache->lock);
3584    if (reserve) {
3585        cache->reserved += num_bytes;
3586        cache->space_info->bytes_reserved += num_bytes;
3587    } else {
3588        cache->reserved -= num_bytes;
3589        cache->space_info->bytes_reserved -= num_bytes;
3590    }
3591    spin_unlock(&cache->lock);
3592    spin_unlock(&cache->space_info->lock);
3593    return 0;
3594}
3595
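/*
 * called as a transaction commit starts: remember how far each in-flight
 * caching thread has progressed (last_byte_to_unpin) and flip which of the
 * two freed_extents trees collects the extents pinned by the next
 * transaction.
 */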
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3597                struct btrfs_root *root)
3598{
3599    struct btrfs_fs_info *fs_info = root->fs_info;
3600    struct btrfs_caching_control *next;
3601    struct btrfs_caching_control *caching_ctl;
3602    struct btrfs_block_group_cache *cache;
3603
3604    down_write(&fs_info->extent_commit_sem);
3605
3606    list_for_each_entry_safe(caching_ctl, next,
3607                 &fs_info->caching_block_groups, list) {
3608        cache = caching_ctl->block_group;
3609        if (block_group_cache_done(cache)) {
3610            cache->last_byte_to_unpin = (u64)-1;
3611            list_del_init(&caching_ctl->list);
3612            put_caching_control(caching_ctl);
3613        } else {
3614            cache->last_byte_to_unpin = caching_ctl->progress;
3615        }
3616    }
3617
3618    if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3619        fs_info->pinned_extents = &fs_info->freed_extents[1];
3620    else
3621        fs_info->pinned_extents = &fs_info->freed_extents[0];
3622
3623    up_write(&fs_info->extent_commit_sem);
3624    return 0;
3625}
3626
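/*
 * walk the range [start, end], find the block group covering each piece of
 * it, return the space to the free space cache when it falls below the
 * group's last_byte_to_unpin, and drop the pinned counters.
 */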
3627static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3628{
3629    struct btrfs_fs_info *fs_info = root->fs_info;
3630    struct btrfs_block_group_cache *cache = NULL;
3631    u64 len;
3632
3633    while (start <= end) {
3634        if (!cache ||
3635            start >= cache->key.objectid + cache->key.offset) {
3636            if (cache)
3637                btrfs_put_block_group(cache);
3638            cache = btrfs_lookup_block_group(fs_info, start);
3639            BUG_ON(!cache);
3640        }
3641
3642        len = cache->key.objectid + cache->key.offset - start;
3643        len = min(len, end + 1 - start);
3644
3645        if (start < cache->last_byte_to_unpin) {
3646            len = min(len, cache->last_byte_to_unpin - start);
3647            btrfs_add_free_space(cache, start, len);
3648        }
3649
3650        spin_lock(&cache->space_info->lock);
3651        spin_lock(&cache->lock);
3652        cache->pinned -= len;
3653        cache->space_info->bytes_pinned -= len;
3654        spin_unlock(&cache->lock);
3655        spin_unlock(&cache->space_info->lock);
3656
3657        start += len;
3658    }
3659
3660    if (cache)
3661        btrfs_put_block_group(cache);
3662    return 0;
3663}
3664
3665int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3666                   struct btrfs_root *root)
3667{
3668    struct btrfs_fs_info *fs_info = root->fs_info;
3669    struct extent_io_tree *unpin;
3670    u64 start;
3671    u64 end;
3672    int ret;
3673
3674    if (fs_info->pinned_extents == &fs_info->freed_extents[0])
3675        unpin = &fs_info->freed_extents[1];
3676    else
3677        unpin = &fs_info->freed_extents[0];
3678
3679    while (1) {
3680        ret = find_first_extent_bit(unpin, 0, &start, &end,
3681                        EXTENT_DIRTY);
3682        if (ret)
3683            break;
3684
3685        ret = btrfs_discard_extent(root, start, end + 1 - start);
3686
3687        clear_extent_dirty(unpin, start, end, GFP_NOFS);
3688        unpin_extent_range(root, start, end);
3689        cond_resched();
3690    }
3691
3692    return ret;
3693}
3694
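/*
 * pin an extent that is being freed.  Data, and everything when the discard
 * mount option is set, is simply pinned.  A tree block that is uptodate,
 * unwritten and from the current transaction (and not from the tree log) is
 * instead handed back locked through must_clean with a return value of 1 so
 * the caller can reuse it.
 */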
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696              struct btrfs_root *root,
3697              struct btrfs_path *path,
3698              u64 bytenr, u64 num_bytes,
3699              int is_data, int reserved,
3700              struct extent_buffer **must_clean)
3701{
3702    int err = 0;
3703    struct extent_buffer *buf;
3704
3705    if (is_data)
3706        goto pinit;
3707
3708    /*
3709     * discard is sloooow, and so triggering discards on
3710     * individual btree blocks isn't a good plan. Just
3711     * pin everything in discard mode.
3712     */
3713    if (btrfs_test_opt(root, DISCARD))
3714        goto pinit;
3715
3716    buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717    if (!buf)
3718        goto pinit;
3719
3720    /* we can reuse a block if it hasn't been written
3721     * and it is from this transaction. We can't
3722     * reuse anything from the tree log root because
3723     * it has tiny sub-transactions.
3724     */
3725    if (btrfs_buffer_uptodate(buf, 0) &&
3726        btrfs_try_tree_lock(buf)) {
3727        u64 header_owner = btrfs_header_owner(buf);
3728        u64 header_transid = btrfs_header_generation(buf);
3729        if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730            header_transid == trans->transid &&
3731            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732            *must_clean = buf;
3733            return 1;
3734        }
3735        btrfs_tree_unlock(buf);
3736    }
3737    free_extent_buffer(buf);
3738pinit:
3739    if (path)
3740        btrfs_set_path_blocking(path);
3741    /* unlocks the pinned mutex */
3742    btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743
3744    BUG_ON(err < 0);
3745    return 0;
3746}
3747
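/*
 * drop refs_to_drop references to an extent: find the backref and the
 * extent item, decrement the ref count, and once it hits zero remove the
 * items, pin (or reuse) the block, delete csums for data extents and
 * update the block group accounting.
 */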
3748static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3749                struct btrfs_root *root,
3750                u64 bytenr, u64 num_bytes, u64 parent,
3751                u64 root_objectid, u64 owner_objectid,
3752                u64 owner_offset, int refs_to_drop,
3753                struct btrfs_delayed_extent_op *extent_op)
3754{
3755    struct btrfs_key key;
3756    struct btrfs_path *path;
3757    struct btrfs_fs_info *info = root->fs_info;
3758    struct btrfs_root *extent_root = info->extent_root;
3759    struct extent_buffer *leaf;
3760    struct btrfs_extent_item *ei;
3761    struct btrfs_extent_inline_ref *iref;
3762    int ret;
3763    int is_data;
3764    int extent_slot = 0;
3765    int found_extent = 0;
3766    int num_to_del = 1;
3767    u32 item_size;
3768    u64 refs;
3769
3770    path = btrfs_alloc_path();
3771    if (!path)
3772        return -ENOMEM;
3773
3774    path->reada = 1;
3775    path->leave_spinning = 1;
3776
3777    is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3778    BUG_ON(!is_data && refs_to_drop != 1);
3779
3780    ret = lookup_extent_backref(trans, extent_root, path, &iref,
3781                    bytenr, num_bytes, parent,
3782                    root_objectid, owner_objectid,
3783                    owner_offset);
3784    if (ret == 0) {
3785        extent_slot = path->slots[0];
3786        while (extent_slot >= 0) {
3787            btrfs_item_key_to_cpu(path->nodes[0], &key,
3788                          extent_slot);
3789            if (key.objectid != bytenr)
3790                break;
3791            if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3792                key.offset == num_bytes) {
3793                found_extent = 1;
3794                break;
3795            }
3796            if (path->slots[0] - extent_slot > 5)
3797                break;
3798            extent_slot--;
3799        }
3800#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3801        item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
3802        if (found_extent && item_size < sizeof(*ei))
3803            found_extent = 0;
3804#endif
3805        if (!found_extent) {
3806            BUG_ON(iref);
3807            ret = remove_extent_backref(trans, extent_root, path,
3808                            NULL, refs_to_drop,
3809                            is_data);
3810            BUG_ON(ret);
3811            btrfs_release_path(extent_root, path);
3812            path->leave_spinning = 1;
3813
3814            key.objectid = bytenr;
3815            key.type = BTRFS_EXTENT_ITEM_KEY;
3816            key.offset = num_bytes;
3817
3818            ret = btrfs_search_slot(trans, extent_root,
3819                        &key, path, -1, 1);
3820            if (ret) {
3821                printk(KERN_ERR "umm, got %d back from search"
3822                       ", was looking for %llu\n", ret,
3823                       (unsigned long long)bytenr);
3824                btrfs_print_leaf(extent_root, path->nodes[0]);
3825            }
3826            BUG_ON(ret);
3827            extent_slot = path->slots[0];
3828        }
3829    } else {
3830        btrfs_print_leaf(extent_root, path->nodes[0]);
3831        WARN_ON(1);
3832        printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
3833               "parent %llu root %llu owner %llu offset %llu\n",
3834               (unsigned long long)bytenr,
3835               (unsigned long long)parent,
3836               (unsigned long long)root_objectid,
3837               (unsigned long long)owner_objectid,
3838               (unsigned long long)owner_offset);
3839    }
3840
3841    leaf = path->nodes[0];
3842    item_size = btrfs_item_size_nr(leaf, extent_slot);
3843#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3844    if (item_size < sizeof(*ei)) {
3845        BUG_ON(found_extent || extent_slot != path->slots[0]);
3846        ret = convert_extent_item_v0(trans, extent_root, path,
3847                         owner_objectid, 0);
3848        BUG_ON(ret < 0);
3849
3850        btrfs_release_path(extent_root, path);
3851        path->leave_spinning = 1;
3852
3853        key.objectid = bytenr;
3854        key.type = BTRFS_EXTENT_ITEM_KEY;
3855        key.offset = num_bytes;
3856
3857        ret = btrfs_search_slot(trans, extent_root, &key, path,
3858                    -1, 1);
3859        if (ret) {
3860            printk(KERN_ERR "umm, got %d back from search"
3861                   ", was looking for %llu\n", ret,
3862                   (unsigned long long)bytenr);
3863            btrfs_print_leaf(extent_root, path->nodes[0]);
3864        }
3865        BUG_ON(ret);
3866        extent_slot = path->slots[0];
3867        leaf = path->nodes[0];
3868        item_size = btrfs_item_size_nr(leaf, extent_slot);
3869    }
3870#endif
3871    BUG_ON(item_size < sizeof(*ei));
3872    ei = btrfs_item_ptr(leaf, extent_slot,
3873                struct btrfs_extent_item);
3874    if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3875        struct btrfs_tree_block_info *bi;
3876        BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3877        bi = (struct btrfs_tree_block_info *)(ei + 1);
3878        WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3879    }
3880
3881    refs = btrfs_extent_refs(leaf, ei);
3882    BUG_ON(refs < refs_to_drop);
3883    refs -= refs_to_drop;
3884
3885    if (refs > 0) {
3886        if (extent_op)
3887            __run_delayed_extent_op(extent_op, leaf, ei);
3888        /*
3889         * In the case of an inline back ref, the reference count will
3890         * be updated by remove_extent_backref
3891         */
3892        if (iref) {
3893            BUG_ON(!found_extent);
3894        } else {
3895            btrfs_set_extent_refs(leaf, ei, refs);
3896            btrfs_mark_buffer_dirty(leaf);
3897        }
3898        if (found_extent) {
3899            ret = remove_extent_backref(trans, extent_root, path,
3900                            iref, refs_to_drop,
3901                            is_data);
3902            BUG_ON(ret);
3903        }
3904    } else {
3905        int mark_free = 0;
3906        struct extent_buffer *must_clean = NULL;
3907
3908        if (found_extent) {
3909            BUG_ON(is_data && refs_to_drop !=
3910                   extent_data_ref_count(root, path, iref));
3911            if (iref) {
3912                BUG_ON(path->slots[0] != extent_slot);
3913            } else {
3914                BUG_ON(path->slots[0] != extent_slot + 1);
3915                path->slots[0] = extent_slot;
3916                num_to_del = 2;
3917            }
3918        }
3919
3920        ret = pin_down_bytes(trans, root, path, bytenr,
3921                     num_bytes, is_data, 0, &must_clean);
3922        if (ret > 0)
3923            mark_free = 1;
3924        BUG_ON(ret < 0);
3925        /*
3926         * it is going to be very rare for someone to be waiting
3927         * on the block we're freeing. del_items might need to
3928         * schedule, so rather than get fancy, just force it
3929         * to blocking here
3930         */
3931        if (must_clean)
3932            btrfs_set_lock_blocking(must_clean);
3933
3934        ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935                      num_to_del);
3936        BUG_ON(ret);
3937        btrfs_release_path(extent_root, path);
3938
3939        if (must_clean) {
3940            clean_tree_block(NULL, root, must_clean);
3941            btrfs_tree_unlock(must_clean);
3942            free_extent_buffer(must_clean);
3943        }
3944
3945        if (is_data) {
3946            ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947            BUG_ON(ret);
3948        } else {
3949            invalidate_mapping_pages(info->btree_inode->i_mapping,
3950                 bytenr >> PAGE_CACHE_SHIFT,
3951                 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952        }
3953
3954        ret = update_block_group(trans, root, bytenr, num_bytes, 0,
3955                     mark_free);
3956        BUG_ON(ret);
3957    }
3958    btrfs_free_path(path);
3959    return ret;
3960}
3961
3962/*
3963 * when we free an extent, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree.
3967 */
3968static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3969                      struct btrfs_root *root, u64 bytenr)
3970{
3971    struct btrfs_delayed_ref_head *head;
3972    struct btrfs_delayed_ref_root *delayed_refs;
3973    struct btrfs_delayed_ref_node *ref;
3974    struct rb_node *node;
3975    int ret;
3976
3977    delayed_refs = &trans->transaction->delayed_refs;
3978    spin_lock(&delayed_refs->lock);
3979    head = btrfs_find_delayed_ref_head(trans, bytenr);
3980    if (!head)
3981        goto out;
3982
3983    node = rb_prev(&head->node.rb_node);
3984    if (!node)
3985        goto out;
3986
3987    ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
3988
3989    /* there are still entries for this ref, we can't drop it */
3990    if (ref->bytenr == bytenr)
3991        goto out;
3992
3993    if (head->extent_op) {
3994        if (!head->must_insert_reserved)
3995            goto out;
3996        kfree(head->extent_op);
3997        head->extent_op = NULL;
3998    }
3999
4000    /*
4001     * waiting for the lock here would deadlock. If someone else has it
4002     * locked, they are already in the process of dropping it anyway
4003     */
4004    if (!mutex_trylock(&head->mutex))
4005        goto out;
4006
4007    /*
4008     * at this point we have a head with no other entries. Go
4009     * ahead and process it.
4010     */
4011    head->node.in_tree = 0;
4012    rb_erase(&head->node.rb_node, &delayed_refs->root);
4013
4014    delayed_refs->num_entries--;
4015
4016    /*
4017     * we don't take a ref on the node because we're removing it from the
4018     * tree, so we just steal the ref the tree was holding.
4019     */
4020    delayed_refs->num_heads--;
4021    if (list_empty(&head->cluster))
4022        delayed_refs->num_heads_ready--;
4023
4024    list_del_init(&head->cluster);
4025    spin_unlock(&delayed_refs->lock);
4026
4027    ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
4028                  &head->node, head->extent_op,
4029                  head->must_insert_reserved);
4030    BUG_ON(ret);
4031    btrfs_put_delayed_ref(&head->node);
4032    return 0;
4033out:
4034    spin_unlock(&delayed_refs->lock);
4035    return 0;
4036}
4037
4038int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039              struct btrfs_root *root,
4040              u64 bytenr, u64 num_bytes, u64 parent,
4041              u64 root_objectid, u64 owner, u64 offset)
4042{
4043    int ret;
4044
4045    /*
4046     * tree log blocks never actually go into the extent allocation
4047     * tree, just update pinning info and exit early.
4048     */
4049    if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4050        WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4051        /* unlocks the pinned mutex */
4052        btrfs_pin_extent(root, bytenr, num_bytes, 1);
4053        ret = 0;
4054    } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4055        ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4056                    parent, root_objectid, (int)owner,
4057                    BTRFS_DROP_DELAYED_REF, NULL);
4058        BUG_ON(ret);
4059        ret = check_ref_cleanup(trans, root, bytenr);
4060        BUG_ON(ret);
4061    } else {
4062        ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063                    parent, root_objectid, owner,
4064                    offset, BTRFS_DROP_DELAYED_REF, NULL);
4065        BUG_ON(ret);
4066    }
4067    return ret;
4068}
4069
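/*
 * drop blocksize from the root's bytes-used accounting and free the tree
 * block through the normal extent free path.
 */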
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071              struct btrfs_root *root,
4072              u64 bytenr, u32 blocksize,
4073              u64 parent, u64 root_objectid, int level)
4074{
4075    u64 used;
4076    spin_lock(&root->node_lock);
4077    used = btrfs_root_used(&root->root_item) - blocksize;
4078    btrfs_set_root_used(&root->root_item, used);
4079    spin_unlock(&root->node_lock);
4080
4081    return btrfs_free_extent(trans, root, bytenr, blocksize,
4082                 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{
4087    u64 mask = ((u64)root->stripesize - 1);
4088    u64 ret = (val + mask) & ~mask;
4089    return ret;
4090}
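
/*
 * e.g. with a (hypothetical) stripesize of 64K, stripe_align() turns
 * val = 100000 into (100000 + 65535) & ~65535 = 131072
 */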
4091
4092/*
4093 * when we wait for progress in the block group caching, it's because
4094 * our allocation attempt failed at least once. So, we must sleep
4095 * and let some progress happen before we try again.
4096 *
4097 * This function will sleep at least once waiting for new free space to
4098 * show up, and then it will check the block group free space numbers
4099 * for our min num_bytes. Another option is to have it go ahead
4100 * and look in the rbtree for a free extent of a given size, but this
4101 * is a good start.
4102 */
4103static noinline int
4104wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4105                u64 num_bytes)
4106{
4107    struct btrfs_caching_control *caching_ctl;
4108    DEFINE_WAIT(wait);
4109
4110    caching_ctl = get_caching_control(cache);
4111    if (!caching_ctl)
4112        return 0;
4113
4114    wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4115           (cache->free_space >= num_bytes));
4116
4117    put_caching_control(caching_ctl);
4118    return 0;
4119}
4120
4121static noinline int
4122wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4123{
4124    struct btrfs_caching_control *caching_ctl;
4125    DEFINE_WAIT(wait);
4126
4127    caching_ctl = get_caching_control(cache);
4128    if (!caching_ctl)
4129        return 0;
4130
4131    wait_event(caching_ctl->wait, block_group_cache_done(cache));
4132
4133    put_caching_control(caching_ctl);
4134    return 0;
4135}
4136
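/*
 * find_free_extent() steps through these states in order, relaxing its
 * requirements on each pass; see the comment above the retry logic near
 * the end of that function for what each state allows.
 */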
4137enum btrfs_loop_type {
4138    LOOP_FIND_IDEAL = 0,
4139    LOOP_CACHING_NOWAIT = 1,
4140    LOOP_CACHING_WAIT = 2,
4141    LOOP_ALLOC_CHUNK = 3,
4142    LOOP_NO_EMPTY_SIZE = 4,
4143};
4144
4145/*
4146 * walks the btree of allocated extents and finds a hole of a given size.
4147 * The key ins is changed to record the hole:
4148 * ins->objectid == block start
4149 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4150 * ins->offset == number of blocks
4151 * Any available blocks before search_start are skipped.
4152 */
4153static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4154                     struct btrfs_root *orig_root,
4155                     u64 num_bytes, u64 empty_size,
4156                     u64 search_start, u64 search_end,
4157                     u64 hint_byte, struct btrfs_key *ins,
4158                     u64 exclude_start, u64 exclude_nr,
4159                     int data)
4160{
4161    int ret = 0;
4162    struct btrfs_root *root = orig_root->fs_info->extent_root;
4163    struct btrfs_free_cluster *last_ptr = NULL;
4164    struct btrfs_block_group_cache *block_group = NULL;
4165    int empty_cluster = 2 * 1024 * 1024;
4166    int allowed_chunk_alloc = 0;
4167    int done_chunk_alloc = 0;
4168    struct btrfs_space_info *space_info;
4169    int last_ptr_loop = 0;
4170    int loop = 0;
4171    bool found_uncached_bg = false;
4172    bool failed_cluster_refill = false;
4173    bool failed_alloc = false;
4174    u64 ideal_cache_percent = 0;
4175    u64 ideal_cache_offset = 0;
4176
4177    WARN_ON(num_bytes < root->sectorsize);
4178    btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4179    ins->objectid = 0;
4180    ins->offset = 0;
4181
4182    space_info = __find_space_info(root->fs_info, data);
4183    if (!space_info) {
4184        printk(KERN_ERR "No space info for %d\n", data);
4185        return -ENOSPC;
4186    }
4187
4188    if (orig_root->ref_cows || empty_size)
4189        allowed_chunk_alloc = 1;
4190
4191    if (data & BTRFS_BLOCK_GROUP_METADATA) {
4192        last_ptr = &root->fs_info->meta_alloc_cluster;
4193        if (!btrfs_test_opt(root, SSD))
4194            empty_cluster = 64 * 1024;
4195    }
4196
4197    if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
4198        last_ptr = &root->fs_info->data_alloc_cluster;
4199    }
4200
4201    if (last_ptr) {
4202        spin_lock(&last_ptr->lock);
4203        if (last_ptr->block_group)
4204            hint_byte = last_ptr->window_start;
4205        spin_unlock(&last_ptr->lock);
4206    }
4207
4208    search_start = max(search_start, first_logical_byte(root, 0));
4209    search_start = max(search_start, hint_byte);
4210
4211    if (!last_ptr)
4212        empty_cluster = 0;
4213
4214    if (search_start == hint_byte) {
4215ideal_cache:
4216        block_group = btrfs_lookup_block_group(root->fs_info,
4217                               search_start);
4218        /*
4219         * we don't want to use the block group if it doesn't match our
4220         * allocation bits, or if it's not cached.
4221         *
4222         * However if we are re-searching with an ideal block group
4223         * picked out then we don't care that the block group is cached.
4224         */
4225        if (block_group && block_group_bits(block_group, data) &&
4226            (block_group->cached != BTRFS_CACHE_NO ||
4227             search_start == ideal_cache_offset)) {
4228            down_read(&space_info->groups_sem);
4229            if (list_empty(&block_group->list) ||
4230                block_group->ro) {
4231                /*
4232                 * someone is removing this block group,
4233                 * we can't jump into the have_block_group
4234                 * target because our list pointers are not
4235                 * valid
4236                 */
4237                btrfs_put_block_group(block_group);
4238                up_read(&space_info->groups_sem);
4239            } else {
4240                goto have_block_group;
4241            }
4242        } else if (block_group) {
4243            btrfs_put_block_group(block_group);
4244        }
4245    }
4246search:
4247    down_read(&space_info->groups_sem);
4248    list_for_each_entry(block_group, &space_info->block_groups, list) {
4249        u64 offset;
4250        int cached;
4251
4252        btrfs_get_block_group(block_group);
4253        search_start = block_group->key.objectid;
4254
4255have_block_group:
4256        if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4257            u64 free_percent;
4258
4259            free_percent = btrfs_block_group_used(&block_group->item);
4260            free_percent *= 100;
4261            free_percent = div64_u64(free_percent,
4262                         block_group->key.offset);
4263            free_percent = 100 - free_percent;
4264            if (free_percent > ideal_cache_percent &&
4265                likely(!block_group->ro)) {
4266                ideal_cache_offset = block_group->key.objectid;
4267                ideal_cache_percent = free_percent;
4268            }
4269
4270            /*
4271             * We only want to start kthread caching if we are at
4272             * the point where we will wait for caching to make
4273             * progress, or if our ideal search is over and we've
4274             * found somebody to start caching.
4275             */
4276            if (loop > LOOP_CACHING_NOWAIT ||
4277                (loop > LOOP_FIND_IDEAL &&
4278                 atomic_read(&space_info->caching_threads) < 2)) {
4279                ret = cache_block_group(block_group);
4280                BUG_ON(ret);
4281            }
4282            found_uncached_bg = true;
4283
4284            /*
4285             * If loop is set for cached only, try the next block
4286             * group.
4287             */
4288            if (loop == LOOP_FIND_IDEAL)
4289                goto loop;
4290        }
4291
4292        cached = block_group_cache_done(block_group);
4293        if (unlikely(!cached))
4294            found_uncached_bg = true;
4295
4296        if (unlikely(block_group->ro))
4297            goto loop;
4298
4299        /*
4300         * Ok, we want to try the cluster allocator, so let's look
4301         * there, unless we are on LOOP_NO_EMPTY_SIZE.  By then we
4302         * will have tried the cluster allocator plenty of times and
4303         * not found anything, so we are likely far too fragmented
4304         * for the clustering code to find anything.  Just skip it
4305         * and let the allocator find whatever block it can
4306         * find.
4307         */
4308        if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
4309            /*
4310             * the refill lock keeps out other
4311             * people trying to start a new cluster
4312             */
4313            spin_lock(&last_ptr->refill_lock);
4314            if (last_ptr->block_group &&
4315                (last_ptr->block_group->ro ||
4316                !block_group_bits(last_ptr->block_group, data))) {
4317                offset = 0;
4318                goto refill_cluster;
4319            }
4320
4321            offset = btrfs_alloc_from_cluster(block_group, last_ptr,
4322                         num_bytes, search_start);
4323            if (offset) {
4324                /* we have a block, we're done */
4325                spin_unlock(&last_ptr->refill_lock);
4326                goto checks;
4327            }
4328
4329            spin_lock(&last_ptr->lock);
4330            /*
4331             * whoops, this cluster doesn't actually point to
4332             * this block group. Get a ref on the block
4333             * group it does point to and try again
4334             */
4335            if (!last_ptr_loop && last_ptr->block_group &&
4336                last_ptr->block_group != block_group) {
4337
4338                btrfs_put_block_group(block_group);
4339                block_group = last_ptr->block_group;
4340                btrfs_get_block_group(block_group);
4341                spin_unlock(&last_ptr->lock);
4342                spin_unlock(&last_ptr->refill_lock);
4343
4344                last_ptr_loop = 1;
4345                search_start = block_group->key.objectid;
4346                /*
4347                 * we know this block group is properly
4348                 * in the list because
4349                 * btrfs_remove_block_group drops the
4350                 * cluster before it removes the block
4351                 * group from the list
4352                 */
4353                goto have_block_group;
4354            }
4355            spin_unlock(&last_ptr->lock);
4356refill_cluster:
4357            /*
4358             * this cluster didn't work out, free it and
4359             * start over
4360             */
4361            btrfs_return_cluster_to_free_space(NULL, last_ptr);
4362
4363            last_ptr_loop = 0;
4364
4365            /* allocate a cluster in this block group */
4366            ret = btrfs_find_space_cluster(trans, root,
4367                           block_group, last_ptr,
4368                           offset, num_bytes,
4369                           empty_cluster + empty_size);
4370            if (ret == 0) {
4371                /*
4372                 * now pull our allocation out of this
4373                 * cluster
4374                 */
4375                offset = btrfs_alloc_from_cluster(block_group,
4376                          last_ptr, num_bytes,
4377                          search_start);
4378                if (offset) {
4379                    /* we found one, proceed */
4380                    spin_unlock(&last_ptr->refill_lock);
4381                    goto checks;
4382                }
4383            } else if (!cached && loop > LOOP_CACHING_NOWAIT
4384                   && !failed_cluster_refill) {
4385                spin_unlock(&last_ptr->refill_lock);
4386
4387                failed_cluster_refill = true;
4388                wait_block_group_cache_progress(block_group,
4389                       num_bytes + empty_cluster + empty_size);
4390                goto have_block_group;
4391            }
4392
4393            /*
4394             * at this point we either didn't find a cluster
4395             * or we weren't able to allocate a block from our
4396             * cluster. Free the cluster we've been trying
4397             * to use, and go to the next block group
4398             */
4399            btrfs_return_cluster_to_free_space(NULL, last_ptr);
4400            spin_unlock(&last_ptr->refill_lock);
4401            goto loop;
4402        }
4403
4404        offset = btrfs_find_space_for_alloc(block_group, search_start,
4405                            num_bytes, empty_size);
4406        /*
4407         * If we didn't find a chunk, and we haven't failed on this
4408         * block group before, and this block group is in the middle of
4409         * caching and we are ok with waiting, then go ahead and wait
4410         * for progress to be made, and set failed_alloc to true.
4411         *
4412         * If failed_alloc is true then we've already waited on this
4413         * block group once and should move on to the next block group.
4414         */
4415        if (!offset && !failed_alloc && !cached &&
4416            loop > LOOP_CACHING_NOWAIT) {
4417            wait_block_group_cache_progress(block_group,
4418                        num_bytes + empty_size);
4419            failed_alloc = true;
4420            goto have_block_group;
4421        } else if (!offset) {
4422            goto loop;
4423        }
4424checks:
4425        search_start = stripe_align(root, offset);
4426        /* move on to the next group */
4427        if (search_start + num_bytes >= search_end) {
4428            btrfs_add_free_space(block_group, offset, num_bytes);
4429            goto loop;
4430        }
4431
4432        /* move on to the next group */
4433        if (search_start + num_bytes >
4434            block_group->key.objectid + block_group->key.offset) {
4435            btrfs_add_free_space(block_group, offset, num_bytes);
4436            goto loop;
4437        }
4438
4439        if (exclude_nr > 0 &&
4440            (search_start + num_bytes > exclude_start &&
4441             search_start < exclude_start + exclude_nr)) {
4442            search_start = exclude_start + exclude_nr;
4443
4444            btrfs_add_free_space(block_group, offset, num_bytes);
4445            /*
4446             * if search_start is still in this block group
4447             * then we just re-search this block group
4448             */
4449            if (search_start >= block_group->key.objectid &&
4450                search_start < (block_group->key.objectid +
4451                        block_group->key.offset))
4452                goto have_block_group;
4453            goto loop;
4454        }
4455
4456        ins->objectid = search_start;
4457        ins->offset = num_bytes;
4458
4459        if (offset < search_start)
4460            btrfs_add_free_space(block_group, offset,
4461                         search_start - offset);
4462        BUG_ON(offset > search_start);
4463
4464        update_reserved_extents(block_group, num_bytes, 1);
4465
4466        /* we are all good, let's return */
4467        break;
4468loop:
4469        failed_cluster_refill = false;
4470        failed_alloc = false;
4471        btrfs_put_block_group(block_group);
4472    }
4473    up_read(&space_info->groups_sem);
4474
4475    /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
4476     * for them to make caching progress. Also
4477     * determine the best possible bg to cache
4478     * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4479     * caching kthreads as we move along
4480     * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4481     * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4482     * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4483     * again
4484     */
4485    if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486        (found_uncached_bg || empty_size || empty_cluster ||
4487         allowed_chunk_alloc)) {
4488        if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489            found_uncached_bg = false;
4490            loop++;
4491            if (!ideal_cache_percent &&
4492                atomic_read(&space_info->caching_threads))
4493                goto search;
4494
4495            /*
4496             * 1 of the following 2 things has happened so far
4497             *
4498             * 1) We found an ideal block group for caching that
4499             * is mostly full and will cache quickly, so we might
4500             * as well wait for it.
4501             *
4502             * 2) We searched for cached only and we didn't find
4503             * anything, and we didn't start any caching kthreads
4504             * either, so chances are we will loop through and
4505             * start a couple caching kthreads, and then come back
4506             * around and just wait for them. This will be slower
4507             * because we will have 2 caching kthreads reading at
4508             * the same time when we could have just started one
4509             * and waited for it to get far enough to give us an
4510             * allocation, so go ahead and go to the wait caching
4511             * loop.
4512             */
4513            loop = LOOP_CACHING_WAIT;
4514            search_start = ideal_cache_offset;
4515            ideal_cache_percent = 0;
4516            goto ideal_cache;
4517        } else if (loop == LOOP_FIND_IDEAL) {
4518            /*
4519             * Didn't find an uncached bg, wait on anything we find
4520             * next.
4521             */
4522            loop = LOOP_CACHING_WAIT;
4523            goto search;
4524        }
4525
4526        if (loop < LOOP_CACHING_WAIT) {
4527            loop++;
4528            goto search;
4529        }
4530
4531        if (loop == LOOP_ALLOC_CHUNK) {
4532            empty_size = 0;
4533            empty_cluster = 0;
4534        }
4535
4536        if (allowed_chunk_alloc) {
4537            ret = do_chunk_alloc(trans, root, num_bytes +
4538                         2 * 1024 * 1024, data, 1);
4539            allowed_chunk_alloc = 0;
4540            done_chunk_alloc = 1;
4541        } else if (!done_chunk_alloc) {
4542            space_info->force_alloc = 1;
4543        }
4544
4545        if (loop < LOOP_NO_EMPTY_SIZE) {
4546            loop++;
4547            goto search;
4548        }
4549        ret = -ENOSPC;
4550    } else if (!ins->objectid) {
4551        ret = -ENOSPC;
4552    }
4553
4554    /* we found what we needed */
4555    if (ins->objectid) {
4556        if (!(data & BTRFS_BLOCK_GROUP_DATA))
4557            trans->block_group = block_group->key.objectid;
4558
4559        btrfs_put_block_group(block_group);
4560        ret = 0;
4561    }
4562
4563    return ret;
4564}
4565
4566static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567                int dump_block_groups)
4568{
4569    struct btrfs_block_group_cache *cache;
4570
4571    spin_lock(&info->lock);
4572    printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573           (unsigned long long)(info->total_bytes - info->bytes_used -
4574                    info->bytes_pinned - info->bytes_reserved -
4575                    info->bytes_super),
4576           (info->full) ? "" : "not ");
4577    printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4578           " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4579           "\n",
4580           (unsigned long long)info->total_bytes,
4581           (unsigned long long)info->bytes_pinned,
4582           (unsigned long long)info->bytes_delalloc,
4583           (unsigned long long)info->bytes_may_use,
4584           (unsigned long long)info->bytes_used,
4585           (unsigned long long)info->bytes_root,
4586           (unsigned long long)info->bytes_super,
4587           (unsigned long long)info->bytes_reserved);
4588    spin_unlock(&info->lock);
4589
4590    if (!dump_block_groups)
4591        return;
4592
4593    down_read(&info->groups_sem);
4594    list_for_each_entry(cache, &info->block_groups, list) {
4595        spin_lock(&cache->lock);
4596        printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597               "%llu pinned %llu reserved\n",
4598               (unsigned long long)cache->key.objectid,
4599               (unsigned long long)cache->key.offset,
4600               (unsigned long long)btrfs_block_group_used(&cache->item),
4601               (unsigned long long)cache->pinned,
4602               (unsigned long long)cache->reserved);
4603        btrfs_dump_free_space(cache, bytes);
4604        spin_unlock(&cache->lock);
4605    }
4606    up_read(&info->groups_sem);
4607}
4608
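/*
 * reserve an extent of at least min_alloc_size bytes.  On ENOSPC the
 * request is halved (but kept sector aligned and >= min_alloc_size), a
 * chunk allocation is forced and the search is retried; if it still
 * fails, the space_info is dumped for debugging.
 */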
4609int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
4610             struct btrfs_root *root,
4611             u64 num_bytes, u64 min_alloc_size,
4612             u64 empty_size, u64 hint_byte,
4613             u64 search_end, struct btrfs_key *ins,
4614             u64 data)
4615{
4616    int ret;
4617    u64 search_start = 0;
4618
4619    data = btrfs_get_alloc_profile(root, data);
4620again:
4621    /*
4622     * the only place that sets empty_size is btrfs_realloc_node, which
4623     * is not called recursively on allocations
4624     */
4625    if (empty_size || root->ref_cows)
4626        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4627                     num_bytes + 2 * 1024 * 1024, data, 0);
4628
4629    WARN_ON(num_bytes < root->sectorsize);
4630    ret = find_free_extent(trans, root, num_bytes, empty_size,
4631                   search_start, search_end, hint_byte, ins,
4632                   trans->alloc_exclude_start,
4633                   trans->alloc_exclude_nr, data);
4634
4635    if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636        num_bytes = num_bytes >> 1;
4637        num_bytes = num_bytes & ~(root->sectorsize - 1);
4638        num_bytes = max(num_bytes, min_alloc_size);
4639        do_chunk_alloc(trans, root->fs_info->extent_root,
4640                   num_bytes, data, 1);
4641        goto again;
4642    }
4643    if (ret == -ENOSPC) {
4644        struct btrfs_space_info *sinfo;
4645
4646        sinfo = __find_space_info(root->fs_info, data);
4647        printk(KERN_ERR "btrfs allocation failed flags %llu, "
4648               "wanted %llu\n", (unsigned long long)data,
4649               (unsigned long long)num_bytes);
4650        dump_space_info(sinfo, num_bytes, 1);
4651    }
4652
4653    return ret;
4654}
4655
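/*
 * give back an extent that was reserved but never used: discard it,
 * return it to the block group's free space cache and clear the
 * reservation.
 */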
4656int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4657{
4658    struct btrfs_block_group_cache *cache;
4659    int ret = 0;
4660
4661    cache = btrfs_lookup_block_group(root->fs_info, start);
4662    if (!cache) {
4663        printk(KERN_ERR "Unable to find block group for %llu\n",
4664               (unsigned long long)start);
4665        return -ENOSPC;
4666    }
4667
4668    ret = btrfs_discard_extent(root, start, len);
4669
4670    btrfs_add_free_space(cache, start, len);
4671    update_reserved_extents(cache, len, 0);
4672    btrfs_put_block_group(cache);
4673
4674    return ret;
4675}
4676
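/*
 * insert the extent item plus an inline backref (shared if parent is set,
 * keyed otherwise) for a newly allocated data extent, then account the
 * bytes as used in the block group.
 */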
4677static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4678                      struct btrfs_root *root,
4679                      u64 parent, u64 root_objectid,
4680                      u64 flags, u64 owner, u64 offset,
4681                      struct btrfs_key *ins, int ref_mod)
4682{
4683    int ret;
4684    struct btrfs_fs_info *fs_info = root->fs_info;
4685    struct btrfs_extent_item *extent_item;
4686    struct btrfs_extent_inline_ref *iref;
4687    struct btrfs_path *path;
4688    struct extent_buffer *leaf;
4689    int type;
4690    u32 size;
4691
4692    if (parent > 0)
4693        type = BTRFS_SHARED_DATA_REF_KEY;
4694    else
4695        type = BTRFS_EXTENT_DATA_REF_KEY;
4696
4697    size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4698
4699    path = btrfs_alloc_path();
4700    BUG_ON(!path);
4701
4702    path->leave_spinning = 1;
4703    ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4704                      ins, size);
4705    BUG_ON(ret);
4706
4707    leaf = path->nodes[0];
4708    extent_item = btrfs_item_ptr(leaf, path->slots[0],
4709                     struct btrfs_extent_item);
4710    btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4711    btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4712    btrfs_set_extent_flags(leaf, extent_item,
4713                   flags | BTRFS_EXTENT_FLAG_DATA);
4714
4715    iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4716    btrfs_set_extent_inline_ref_type(leaf, iref, type);
4717    if (parent > 0) {
4718        struct btrfs_shared_data_ref *ref;
4719        ref = (struct btrfs_shared_data_ref *)(iref + 1);
4720        btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4721        btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4722    } else {
4723        struct btrfs_extent_data_ref *ref;
4724        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4725        btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4726        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4727        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4728        btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4729    }
4730
4731    btrfs_mark_buffer_dirty(path->nodes[0]);
4732    btrfs_free_path(path);
4733
4734    ret = update_block_group(trans, root, ins->objectid, ins->offset,
4735                 1, 0);
4736    if (ret) {
4737        printk(KERN_ERR "btrfs update block group failed for %llu "
4738               "%llu\n", (unsigned long long)ins->objectid,
4739               (unsigned long long)ins->offset);
4740        BUG();
4741    }
4742    return ret;
4743}
4744
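/*
 * like alloc_reserved_file_extent() but for a tree block: the extent item
 * also carries a btrfs_tree_block_info with the block's first key and
 * level, followed by a single shared or keyed tree block backref.
 */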
4745static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4746                     struct btrfs_root *root,
4747                     u64 parent, u64 root_objectid,
4748                     u64 flags, struct btrfs_disk_key *key,
4749                     int level, struct btrfs_key *ins)
4750{
4751    int ret;
4752    struct btrfs_fs_info *fs_info = root->fs_info;
4753    struct btrfs_extent_item *extent_item;
4754    struct btrfs_tree_block_info *block_info;
4755    struct btrfs_extent_inline_ref *iref;
4756    struct btrfs_path *path;
4757    struct extent_buffer *leaf;
4758    u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
4759
4760    path = btrfs_alloc_path();
4761    BUG_ON(!path);
4762
4763    path->leave_spinning = 1;
4764    ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4765                      ins, size);
4766    BUG_ON(ret);
4767
4768    leaf = path->nodes[0];
4769    extent_item = btrfs_item_ptr(leaf, path->slots[0],
4770                     struct btrfs_extent_item);
4771    btrfs_set_extent_refs(leaf, extent_item, 1);
4772    btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4773    btrfs_set_extent_flags(leaf, extent_item,
4774                   flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4775    block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4776
4777    btrfs_set_tree_block_key(leaf, block_info, key);
4778    btrfs_set_tree_block_level(leaf, block_info, level);
4779
4780    iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4781    if (parent > 0) {
4782        BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4783        btrfs_set_extent_inline_ref_type(leaf, iref,
4784                         BTRFS_SHARED_BLOCK_REF_KEY);
4785        btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4786    } else {
4787        btrfs_set_extent_inline_ref_type(leaf, iref,
4788                         BTRFS_TREE_BLOCK_REF_KEY);
4789        btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
4790    }
4791
4792    btrfs_mark_buffer_dirty(leaf);
4793    btrfs_free_path(path);
4794
4795    ret = update_block_group(trans, root, ins->objectid, ins->offset,
4796                 1, 0);
4797    if (ret) {
4798        printk(KERN_ERR "btrfs update block group failed for %llu "
4799               "%llu\n", (unsigned long long)ins->objectid,
4800               (unsigned long long)ins->offset);
4801        BUG();
4802    }
4803    return ret;
4804}
4805
4806int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4807                     struct btrfs_root *root,
4808                     u64 root_objectid, u64 owner,
4809                     u64 offset, struct btrfs_key *ins)
4810{
4811    int ret;
4812
4813    BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
4814
4815    ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
4816                     0, root_objectid, owner, offset,
4817                     BTRFS_ADD_DELAYED_EXTENT, NULL);
4818    return ret;
4819}
4820
4821/*
4822 * this is used by the tree logging recovery code. It records that
4823 * an extent has been allocated and makes sure the range is removed
4824 * from the free space cache (or excluded while caching is in progress)
4825 */
4826int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4827                   struct btrfs_root *root,
4828                   u64 root_objectid, u64 owner, u64 offset,
4829                   struct btrfs_key *ins)
4830{
4831    int ret;
4832    struct btrfs_block_group_cache *block_group;
4833    struct btrfs_caching_control *caching_ctl;
4834    u64 start = ins->objectid;
4835    u64 num_bytes = ins->offset;
4836
4837    block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4838    cache_block_group(block_group);
4839    caching_ctl = get_caching_control(block_group);
4840
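    /*
     * three cases, depending on how far the free space caching thread
     * has gotten for this block group:
     *
     *  - caching finished: just remove the range from the cache
     *  - range lies entirely beyond ->progress: it was never cached,
     *    mark it excluded so the caching thread skips it
     *  - range straddles ->progress: remove the cached part and
     *    exclude the rest
     */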
4841    if (!caching_ctl) {
4842        BUG_ON(!block_group_cache_done(block_group));
4843        ret = btrfs_remove_free_space(block_group, start, num_bytes);
4844        BUG_ON(ret);
4845    } else {
4846        mutex_lock(&caching_ctl->mutex);
4847
4848        if (start >= caching_ctl->progress) {
4849            ret = add_excluded_extent(root, start, num_bytes);
4850            BUG_ON(ret);
4851        } else if (start + num_bytes <= caching_ctl->progress) {
4852            ret = btrfs_remove_free_space(block_group,
4853                              start, num_bytes);
4854            BUG_ON(ret);
4855        } else {
4856            num_bytes = caching_ctl->progress - start;
4857            ret = btrfs_remove_free_space(block_group,
4858                              start, num_bytes);
4859            BUG_ON(ret);
4860
4861            start = caching_ctl->progress;
4862            num_bytes = ins->objectid + ins->offset -
4863                    caching_ctl->progress;
4864            ret = add_excluded_extent(root, start, num_bytes);
4865            BUG_ON(ret);
4866        }
4867
4868        mutex_unlock(&caching_ctl->mutex);
4869        put_caching_control(caching_ctl);
4870    }
4871
4872    update_reserved_extents(block_group, ins->offset, 1);
4873    btrfs_put_block_group(block_group);
4874    ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875                     0, owner, offset, ins, 1);
4876    return ret;
4877}
4878
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the reserved extent through ins; the caller
4882 * reads the tree block itself afterwards (see btrfs_init_new_buffer).
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887                struct btrfs_root *root,
4888                u64 num_bytes, u64 parent, u64 root_objectid,
4889                struct btrfs_disk_key *key, int level,
4890                u64 empty_size, u64 hint_byte, u64 search_end,
4891                struct btrfs_key *ins)
4892{
4893    int ret;
4894    u64 flags = 0;
4895
4896    ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897                   empty_size, hint_byte, search_end,
4898                   ins, 0);
4899    if (ret)
4900        return ret;
4901
4902    if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903        if (parent == 0)
4904            parent = ins->objectid;
4905        flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906    } else
4907        BUG_ON(parent > 0);
4908
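    /*
     * blocks belonging to the tree log are not given delayed refs
     * here.  for everything else, queue a delayed ref carrying the
     * key and flags so the extent item gets inserted when the delayed
     * refs are run.
     */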
4909    if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910        struct btrfs_delayed_extent_op *extent_op;
4911        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912        BUG_ON(!extent_op);
4913        if (key)
4914            memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915        else
4916            memset(&extent_op->key, 0, sizeof(extent_op->key));
4917        extent_op->flags_to_set = flags;
4918        extent_op->update_key = 1;
4919        extent_op->update_flags = 1;
4920        extent_op->is_data = 0;
4921
4922        ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923                    ins->offset, parent, root_objectid,
4924                    level, BTRFS_ADD_DELAYED_EXTENT,
4925                    extent_op);
4926        BUG_ON(ret);
4927    }
4928
4929    if (root_objectid == root->root_key.objectid) {
4930        u64 used;
4931        spin_lock(&root->node_lock);
4932        used = btrfs_root_used(&root->root_item) + num_bytes;
4933        btrfs_set_root_used(&root->root_item, used);
4934        spin_unlock(&root->node_lock);
4935    }
4936    return ret;
4937}
4938
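/*
 * set up the extent buffer for a freshly allocated tree block: stamp
 * the transid, set the lockdep class, clear any stale dirty state and
 * mark the block dirty in either the log tree's dirty_log_pages or
 * the transaction's dirty_pages, depending on who owns it.
 */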
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940                        struct btrfs_root *root,
4941                        u64 bytenr, u32 blocksize,
4942                        int level)
4943{
4944    struct extent_buffer *buf;
4945
4946    buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
4947    if (!buf)
4948        return ERR_PTR(-ENOMEM);
4949    btrfs_set_header_generation(buf, trans->transid);
4950    btrfs_set_buffer_lockdep_class(buf, level);
4951    btrfs_tree_lock(buf);
4952    clean_tree_block(trans, root, buf);
4953
4954    btrfs_set_lock_blocking(buf);
4955    btrfs_set_buffer_uptodate(buf);
4956
4957    if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4958        /*
4959         * we allow two log transactions at a time, use different
4960         * EXTENT bits to differentiate dirty pages.
4961         */
4962        if (root->log_transid % 2 == 0)
4963            set_extent_dirty(&root->dirty_log_pages, buf->start,
4964                    buf->start + buf->len - 1, GFP_NOFS);
4965        else
4966            set_extent_new(&root->dirty_log_pages, buf->start,
4967                    buf->start + buf->len - 1, GFP_NOFS);
4968    } else {
4969        set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4970             buf->start + buf->len - 1, GFP_NOFS);
4971    }
4972    trans->blocks_used++;
4973    /* this returns a buffer locked for blocking */
4974    return buf;
4975}
4976
4977/*
4978 * helper function to allocate a block for a given tree
4979 * returns the tree buffer or an ERR_PTR on failure.
4980 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4982                    struct btrfs_root *root, u32 blocksize,
4983                    u64 parent, u64 root_objectid,
4984                    struct btrfs_disk_key *key, int level,
4985                    u64 hint, u64 empty_size)
4986{
4987    struct btrfs_key ins;
4988    int ret;
4989    struct extent_buffer *buf;
4990
4991    ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
4992                   key, level, empty_size, hint, (u64)-1, &ins);
4993    if (ret) {
4994        BUG_ON(ret > 0);
4995        return ERR_PTR(ret);
4996    }
4997
4998    buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999                    blocksize, level);
5000    return buf;
5001}
5002
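/*
 * state for walking a subvolume tree in btrfs_drop_snapshot().
 *
 * the walk runs in two stages: DROP_REFERENCE descends dropping
 * references on blocks owned only by this tree; UPDATE_BACKREF is
 * entered when a shared subtree is found and update_ref is set, so
 * back refs below the shared block are converted before we resume
 * dropping.
 *
 * roughly, the caller alternates
 *
 *   walk_down_tree(trans, root, path, wc);
 *   walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
 *
 * until walk_up_tree() reports that the whole tree has been processed.
 */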
5003struct walk_control {
5004    u64 refs[BTRFS_MAX_LEVEL];
5005    u64 flags[BTRFS_MAX_LEVEL];
5006    struct btrfs_key update_progress;
5007    int stage;
5008    int level;
5009    int shared_level;
5010    int update_ref;
5011    int keep_locks;
5012    int reada_slot;
5013    int reada_count;
5014};
5015
5016#define DROP_REFERENCE 1
5017#define UPDATE_BACKREF 2
5018
5019static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5020                     struct btrfs_root *root,
5021                     struct walk_control *wc,
5022                     struct btrfs_path *path)
5023{
5024    u64 bytenr;
5025    u64 generation;
5026    u64 refs;
5027    u64 flags;
5028    u64 last = 0;
5029    u32 nritems;
5030    u32 blocksize;
5031    struct btrfs_key key;
5032    struct extent_buffer *eb;
5033    int ret;
5034    int slot;
5035    int nread = 0;
5036
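    /*
     * adapt the readahead window: if the walker has not yet reached
     * the last slot we read ahead to, shrink it; otherwise grow it,
     * capped at one node's worth of pointers.
     */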
5037    if (path->slots[wc->level] < wc->reada_slot) {
5038        wc->reada_count = wc->reada_count * 2 / 3;
5039        wc->reada_count = max(wc->reada_count, 2);
5040    } else {
5041        wc->reada_count = wc->reada_count * 3 / 2;
5042        wc->reada_count = min_t(int, wc->reada_count,
5043                    BTRFS_NODEPTRS_PER_BLOCK(root));
5044    }
5045
5046    eb = path->nodes[wc->level];
5047    nritems = btrfs_header_nritems(eb);
5048    blocksize = btrfs_level_size(root, wc->level - 1);
5049
5050    for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5051        if (nread >= wc->reada_count)
5052            break;
5053
5054        cond_resched();
5055        bytenr = btrfs_node_blockptr(eb, slot);
5056        generation = btrfs_node_ptr_generation(eb, slot);
5057
5058        if (slot == path->slots[wc->level])
5059            goto reada;
5060
5061        if (wc->stage == UPDATE_BACKREF &&
5062            generation <= root->root_key.offset)
5063            continue;
5064
5065        /* We don't lock the tree block, it's OK to be racy here */
5066        ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5067                           &refs, &flags);
5068        BUG_ON(ret);
5069        BUG_ON(refs == 0);
5070
5071        if (wc->stage == DROP_REFERENCE) {
5072            if (refs == 1)
5073                goto reada;
5074
5075            if (wc->level == 1 &&
5076                (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5077                continue;
5078            if (!wc->update_ref ||
5079                generation <= root->root_key.offset)
5080                continue;
5081            btrfs_node_key_to_cpu(eb, &key, slot);
5082            ret = btrfs_comp_cpu_keys(&key,
5083                          &wc->update_progress);
5084            if (ret < 0)
5085                continue;
5086        } else {
5087            if (wc->level == 1 &&
5088                (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5089                continue;
5090        }
5091reada:
5092        ret = readahead_tree_block(root, bytenr, blocksize,
5093                       generation);
5094        if (ret)
5095            break;
5096        last = bytenr + blocksize;
5097        nread++;
5098    }
5099    wc->reada_slot = slot;
5100}
5101
5102/*
5103 * helper to process tree block while walking down the tree.
5104 *
5105 * when wc->stage == UPDATE_BACKREF, this function updates
5106 * back refs for pointers in the block.
5107 *
5108 * NOTE: return value 1 means we should stop walking down.
5109 */
5110static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5111                   struct btrfs_root *root,
5112                   struct btrfs_path *path,
5113                   struct walk_control *wc, int lookup_info)
5114{
5115    int level = wc->level;
5116    struct extent_buffer *eb = path->nodes[level];
5117    u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5118    int ret;
5119
5120    if (wc->stage == UPDATE_BACKREF &&
5121        btrfs_header_owner(eb) != root->root_key.objectid)
5122        return 1;
5123
5124    /*
5125     * when reference count of tree block is 1, it won't increase
5126     * again. once full backref flag is set, we never clear it.
5127     */
5128    if (lookup_info &&
5129        ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5130         (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5131        BUG_ON(!path->locks[level]);
5132        ret = btrfs_lookup_extent_info(trans, root,
5133                           eb->start, eb->len,
5134                           &wc->refs[level],
5135                           &wc->flags[level]);
5136        BUG_ON(ret);
5137        BUG_ON(wc->refs[level] == 0);
5138    }
5139
5140    if (wc->stage == DROP_REFERENCE) {
5141        if (wc->refs[level] > 1)
5142            return 1;
5143
5144        if (path->locks[level] && !wc->keep_locks) {
5145            btrfs_tree_unlock(eb);
5146            path->locks[level] = 0;
5147        }
5148        return 0;
5149    }
5150
5151    /* wc->stage == UPDATE_BACKREF */
5152    if (!(wc->flags[level] & flag)) {
5153        BUG_ON(!path->locks[level]);
5154        ret = btrfs_inc_ref(trans, root, eb, 1);
5155        BUG_ON(ret);
5156        ret = btrfs_dec_ref(trans, root, eb, 0);
5157        BUG_ON(ret);
5158        ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5159                          eb->len, flag, 0);
5160        BUG_ON(ret);
5161        wc->flags[level] |= flag;
5162    }
5163
5164    /*
5165     * the block is shared by multiple trees, so it's not good to
5166     * keep the tree lock
5167     */
5168    if (path->locks[level] && level > 0) {
5169        btrfs_tree_unlock(eb);
5170        path->locks[level] = 0;
5171    }
5172    return 0;
5173}
5174
5175/*
5176 * helper to process tree block pointer.
5177 *
5178 * when wc->stage == DROP_REFERENCE, this function checks
5179 * reference count of the block pointed to. if the block
5180 * is shared and we need to update back refs for the subtree
5181 * rooted at the block, this function changes wc->stage to
5182 * UPDATE_BACKREF. if the block is shared and there is no
5183 * need to update back refs, this function drops the reference
5184 * to the block.
5185 *
5186 * NOTE: return value 1 means we should stop walking down.
5187 */
5188static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5189                 struct btrfs_root *root,
5190                 struct btrfs_path *path,
5191                 struct walk_control *wc, int *lookup_info)
5192{
5193    u64 bytenr;
5194    u64 generation;
5195    u64 parent;
5196    u32 blocksize;
5197    struct btrfs_key key;
5198    struct extent_buffer *next;
5199    int level = wc->level;
5200    int reada = 0;
5201    int ret = 0;
5202
5203    generation = btrfs_node_ptr_generation(path->nodes[level],
5204                           path->slots[level]);
5205    /*
5206     * if the lower level block was created before the snapshot
5207     * was created, we know there is no need to update back refs
5208     * for the subtree
5209     */
5210    if (wc->stage == UPDATE_BACKREF &&
5211        generation <= root->root_key.offset) {
5212        *lookup_info = 1;
5213        return 1;
5214    }
5215
5216    bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5217    blocksize = btrfs_level_size(root, level - 1);
5218
5219    next = btrfs_find_tree_block(root, bytenr, blocksize);
5220    if (!next) {
5221        next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5222        if (!next)
5223            return -ENOMEM;
5224        reada = 1;
5225    }
5226    btrfs_tree_lock(next);
5227    btrfs_set_lock_blocking(next);
5228
5229    ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5230                       &wc->refs[level - 1],
5231                       &wc->flags[level - 1]);
5232    BUG_ON(ret);
5233    BUG_ON(wc->refs[level - 1] == 0);
5234    *lookup_info = 0;
5235
5236    if (wc->stage == DROP_REFERENCE) {
5237        if (wc->refs[level - 1] > 1) {
5238            if (level == 1 &&
5239                (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5240                goto skip;
5241
5242            if (!wc->update_ref ||
5243                generation <= root->root_key.offset)
5244                goto skip;
5245
5246            btrfs_node_key_to_cpu(path->nodes[level], &key,
5247                          path->slots[level]);
5248            ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
5249            if (ret < 0)
5250                goto skip;
5251
5252            wc->stage = UPDATE_BACKREF;
5253            wc->shared_level = level - 1;
5254        }
5255    } else {
5256        if (level == 1 &&
5257            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5258            goto skip;
5259    }
5260
5261    if (!btrfs_buffer_uptodate(next, generation)) {
5262        btrfs_tree_unlock(next);
5263        free_extent_buffer(next);
5264        next = NULL;
5265        *lookup_info = 1;
5266    }
5267
5268    if (!next) {
5269        if (reada && level == 1)
5270            reada_walk_down(trans, root, wc, path);
5271        next = read_tree_block(root, bytenr, blocksize, generation);
5272        btrfs_tree_lock(next);
5273        btrfs_set_lock_blocking(next);
5274    }
5275
5276    level--;
5277    BUG_ON(level != btrfs_header_level(next));
5278    path->nodes[level] = next;
5279    path->slots[level] = 0;
5280    path->locks[level] = 1;
5281    wc->level = level;
5282    if (wc->level == 1)
5283        wc->reada_slot = 0;
5284    return 0;
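    /*
     * we are not descending into this block.  if we got here during
     * the DROP_REFERENCE stage the subtree is shared, so drop our
     * single reference on it (with the proper parent for the backref)
     * and let the caller move on to the next slot.
     */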
5285skip:
5286    wc->refs[level - 1] = 0;
5287    wc->flags[level - 1] = 0;
5288    if (wc->stage == DROP_REFERENCE) {
5289        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5290            parent = path->nodes[level]->start;
5291        } else {
5292            BUG_ON(root->root_key.objectid !=
5293                   btrfs_header_owner(path->nodes[level]));
5294            parent = 0;
5295        }
5296
5297        ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
5298                    root->root_key.objectid, level - 1, 0);
5299        BUG_ON(ret);
5300    }
5301    btrfs_tree_unlock(next);
5302    free_extent_buffer(next);
5303    *lookup_info = 1;
5304    return 1;
5305}
5306
5307/*
5308 * helper to process tree block while walking up the tree.
5309 *
5310 * when wc->stage == DROP_REFERENCE, this function drops
5311 * reference count on the block.
5312 *
5313 * when wc->stage == UPDATE_BACKREF, this function changes
5314 * wc->stage back to DROP_REFERENCE if we changed wc->stage
5315 * to UPDATE_BACKREF previously while processing the block.
5316 *
5317 * NOTE: return value 1 means we should stop walking up.
5318 */
5319static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5320                 struct btrfs_root *root,
5321                 struct btrfs_path *path,
5322                 struct walk_control *wc)
5323{
5324    int ret = 0;
5325    int level = wc->level;
5326    struct extent_buffer *eb = path->nodes[level];
5327    u64 parent = 0;
5328
5329    if (wc->stage == UPDATE_BACKREF) {
5330        BUG_ON(wc->shared_level < level);
5331        if (level < wc->shared_level)
5332            goto out;
5333
5334        ret = find_next_key(path, level + 1, &wc->update_progress);
5335        if (ret > 0)
5336            wc->update_ref = 0;
5337
5338        wc->stage = DROP_REFERENCE;
5339        wc->shared_level = -1;
5340        path->slots[level] = 0;
5341
5342        /*
5343         * check reference count again if the block isn't locked.
5344         * we should start walking down the tree again if reference
5345         * count is one.
5346         */
5347        if (!path->locks[level]) {
5348            BUG_ON(level == 0);
5349            btrfs_tree_lock(eb);
5350            btrfs_set_lock_blocking(eb);
5351            path->locks[level] = 1;
5352
5353            ret = btrfs_lookup_extent_info(trans, root,
5354                               eb->start, eb->len,
5355                               &wc->refs[level],
5356                               &wc->flags[level]);
5357            BUG_ON(ret);
5358            BUG_ON(wc->refs[level] == 0);
5359            if (wc->refs[level] == 1) {
5360                btrfs_tree_unlock(eb);
5361                path->locks[level] = 0;
5362                return 1;
5363            }
5364        }
5365    }
5366
5367    /* wc->stage == DROP_REFERENCE */
5368    BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5369
5370    if (wc->refs[level] == 1) {
5371        if (level == 0) {
5372            if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5373                ret = btrfs_dec_ref(trans, root, eb, 1);
5374            else
5375                ret = btrfs_dec_ref(trans, root, eb, 0);
5376            BUG_ON(ret);
5377        }
5378        /* make block locked assertion in clean_tree_block happy */
5379        if (!path->locks[level] &&
5380            btrfs_header_generation(eb) == trans->transid) {
5381            btrfs_tree_lock(eb);
5382            btrfs_set_lock_blocking(eb);
5383            path->locks[level] = 1;
5384        }
5385        clean_tree_block(trans, root, eb);
5386    }
5387
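    /*
     * work out which backref we hold on this block: a full backref
     * keyed on the parent node, or an implicit ref keyed on this root
     * (in which case we must be the owner).
     */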
5388    if (eb == root->node) {
5389        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5390            parent = eb->start;
5391        else
5392            BUG_ON(root->root_key.objectid !=
5393                   btrfs_header_owner(eb));
5394    } else {
5395        if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5396            parent = path->nodes[level + 1]->start;
5397        else
5398            BUG_ON(root->root_key.objectid !=
5399                   btrfs_header_owner(path->nodes[level + 1]));
5400    }
5401
5402    ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
5403                root->root_key.objectid, level, 0);
5404    BUG_ON(ret);
5405out:
5406    wc->refs[level] = 0;
5407    wc->flags[level] = 0;
5408    return ret;
5409}
5410
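/*
 * walk down from wc->level towards the leaves, processing each block
 * with walk_down_proc() and descending via do_walk_down() until we
 * hit a leaf or a block we should not descend into.
 */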
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5412                   struct btrfs_root *root,
5413                   struct btrfs_path *path,
5414                   struct walk_control *wc)
5415{
5416    int level = wc->level;
5417    int lookup_info = 1;
5418    int ret;
5419
5420    while (level >= 0) {
5421        ret = walk_down_proc(trans, root, path, wc, lookup_info);
5422        if (ret > 0)
5423            break;
5424
5425        if (level == 0)
5426            break;
5427
5428        if (path->slots[level] >=
5429            btrfs_header_nritems(path->nodes[level]))
5430            break;
5431
5432        ret = do_walk_down(trans, root, path, wc, &lookup_info);
5433        if (ret > 0) {
5434            path->slots[level]++;
5435            continue;
5436        } else if (ret < 0)
5437            return ret;
5438        level = wc->level;
5439    }
5440    return 0;
5441}
5442
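/*
 * walk back up the tree, finishing blocks with walk_up_proc() and
 * dropping locks on the way.  returns 0 if there is another slot to
 * descend into, 1 once everything up to max_level has been processed.
 */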
5443static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5444                 struct btrfs_root *root,
5445                 struct btrfs_path *path,
5446                 struct walk_control *wc, int max_level)
5447{
5448    int level = wc->level;
5449    int ret;
5450
5451    path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5452    while (level < max_level && path->nodes[level]) {
5453        wc->level = level;
5454        if (path->slots[level] + 1 <
5455            btrfs_header_nritems(path->nodes[level])) {
5456            path->slots[level]++;
5457            return 0;
5458        } else {
5459            ret = walk_up_proc(trans, root, path, wc);
5460            if (ret > 0)
5461                return 0;
5462
5463            if (path->locks[level]) {
5464                btrfs_tree_unlock(path->nodes[level]);
5465                path->locks[level] = 0;
5466            }
5467            free_extent_buffer(path->nodes[level]);
5468            path->nodes[level] = NULL;
5469            level++;
5470        }
5471    }
5472    return 1;
5473}
5474
5475/*
5476 * drop a subvolume tree.
5477 *
5478 * this function traverses the tree freeing any blocks that are only
5479 * referenced by the tree.
5480 *
5481 * when a shared tree block is found, this function decreases its
5482 * reference count by one. if update_ref is true, this function
5483 * also makes sure backrefs for the shared block and all lower level
5484 * blocks are properly updated.
5485 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5487{
5488    struct btrfs_path *path;
5489    struct btrfs_trans_handle *trans;
5490    struct btrfs_root *tree_root = root->fs_info->tree_root;
5491    struct btrfs_root_item *root_item = &root->root_item;
5492    struct walk_control *wc;
5493    struct btrfs_key key;
5494    int err = 0;
5495    int ret;
5496    int level;
5497
5498    path = btrfs_alloc_path();
5499    BUG_ON(!path);
5500
5501    wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502    BUG_ON(!wc);
5503
5504    trans = btrfs_start_transaction(tree_root, 1);
5505
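    /*
     * if drop_progress is zero this is a fresh drop and we start at
     * the locked root node; otherwise we are resuming an interrupted
     * drop and re-search down to the recorded key at drop_level.
     */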
5506    if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507        level = btrfs_header_level(root->node);
5508        path->nodes[level] = btrfs_lock_root_node(root);
5509        btrfs_set_lock_blocking(path->nodes[level]);
5510        path->slots[level] = 0;
5511        path->locks[level] = 1;
5512        memset(&wc->update_progress, 0,
5513               sizeof(wc->update_progress));
5514    } else {
5515        btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5516        memcpy(&wc->update_progress, &key,
5517               sizeof(wc->update_progress));
5518
5519        level = root_item->drop_level;
5520        BUG_ON(level == 0);
5521        path->lowest_level = level;
5522        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5523        path->lowest_level = 0;
5524        if (ret < 0) {
5525            err = ret;
5526            goto out;
5527        }
5528