fs/btrfs/extent_io.c

1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/pagemap.h>
6#include <linux/page-flags.h>
7#include <linux/module.h>
8#include <linux/spinlock.h>
9#include <linux/blkdev.h>
10#include <linux/swap.h>
11#include <linux/writeback.h>
12#include <linux/pagevec.h>
13#include "extent_io.h"
14#include "extent_map.h"
15#include "compat.h"
16#include "ctree.h"
17#include "btrfs_inode.h"
18
19static struct kmem_cache *extent_state_cache;
20static struct kmem_cache *extent_buffer_cache;
21
22static LIST_HEAD(buffers);
23static LIST_HEAD(states);
24
25#define LEAK_DEBUG 0
26#if LEAK_DEBUG
27static DEFINE_SPINLOCK(leak_lock);
28#endif
29
30#define BUFFER_LRU_MAX 64
31
32struct tree_entry {
33    u64 start;
34    u64 end;
35    struct rb_node rb_node;
36};
37
38struct extent_page_data {
39    struct bio *bio;
40    struct extent_io_tree *tree;
41    get_extent_t *get_extent;
42
43    /* tells writepage not to lock the state bits for this range
44     * it still does the unlocking
45     */
46    unsigned int extent_locked:1;
47
48    /* tells the submit_bio code to use a WRITE_SYNC */
49    unsigned int sync_io:1;
50};
51
52int __init extent_io_init(void)
53{
54    extent_state_cache = kmem_cache_create("extent_state",
55            sizeof(struct extent_state), 0,
56            SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
57    if (!extent_state_cache)
58        return -ENOMEM;
59
60    extent_buffer_cache = kmem_cache_create("extent_buffers",
61            sizeof(struct extent_buffer), 0,
62            SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
63    if (!extent_buffer_cache)
64        goto free_state_cache;
65    return 0;
66
67free_state_cache:
68    kmem_cache_destroy(extent_state_cache);
69    return -ENOMEM;
70}
71
72void extent_io_exit(void)
73{
74    struct extent_state *state;
75    struct extent_buffer *eb;
76
77    while (!list_empty(&states)) {
78        state = list_entry(states.next, struct extent_state, leak_list);
79        printk(KERN_ERR "btrfs state leak: start %llu end %llu "
80               "state %lu in tree %p refs %d\n",
81               (unsigned long long)state->start,
82               (unsigned long long)state->end,
83               state->state, state->tree, atomic_read(&state->refs));
84        list_del(&state->leak_list);
85        kmem_cache_free(extent_state_cache, state);
86
87    }
88
89    while (!list_empty(&buffers)) {
90        eb = list_entry(buffers.next, struct extent_buffer, leak_list);
91        printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
92               "refs %d\n", (unsigned long long)eb->start,
93               eb->len, atomic_read(&eb->refs));
94        list_del(&eb->leak_list);
95        kmem_cache_free(extent_buffer_cache, eb);
96    }
97    if (extent_state_cache)
98        kmem_cache_destroy(extent_state_cache);
99    if (extent_buffer_cache)
100        kmem_cache_destroy(extent_buffer_cache);
101}
102
103void extent_io_tree_init(struct extent_io_tree *tree,
104              struct address_space *mapping, gfp_t mask)
105{
106    tree->state = RB_ROOT;
107    tree->buffer = RB_ROOT;
108    tree->ops = NULL;
109    tree->dirty_bytes = 0;
110    spin_lock_init(&tree->lock);
111    spin_lock_init(&tree->buffer_lock);
112    tree->mapping = mapping;
113}
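
/*
 * Illustrative usage (a sketch added for documentation, not code from this
 * file): callers normally embed an extent_io_tree in their own per-inode
 * structure and initialize it once, along the lines of
 *
 *	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 *			    inode->i_mapping, GFP_NOFS);
 *
 * The tree starts out empty; extent_state records are created lazily by
 * set_extent_bit() and friends below.
 */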
114
115static struct extent_state *alloc_extent_state(gfp_t mask)
116{
117    struct extent_state *state;
118#if LEAK_DEBUG
119    unsigned long flags;
120#endif
121
122    state = kmem_cache_alloc(extent_state_cache, mask);
123    if (!state)
124        return state;
125    state->state = 0;
126    state->private = 0;
127    state->tree = NULL;
128#if LEAK_DEBUG
129    spin_lock_irqsave(&leak_lock, flags);
130    list_add(&state->leak_list, &states);
131    spin_unlock_irqrestore(&leak_lock, flags);
132#endif
133    atomic_set(&state->refs, 1);
134    init_waitqueue_head(&state->wq);
135    return state;
136}
137
138static void free_extent_state(struct extent_state *state)
139{
140    if (!state)
141        return;
142    if (atomic_dec_and_test(&state->refs)) {
143#if LEAK_DEBUG
144        unsigned long flags;
145#endif
146        WARN_ON(state->tree);
147#if LEAK_DEBUG
148        spin_lock_irqsave(&leak_lock, flags);
149        list_del(&state->leak_list);
150        spin_unlock_irqrestore(&leak_lock, flags);
151#endif
152        kmem_cache_free(extent_state_cache, state);
153    }
154}
155
156static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
157                   struct rb_node *node)
158{
159    struct rb_node **p = &root->rb_node;
160    struct rb_node *parent = NULL;
161    struct tree_entry *entry;
162
163    while (*p) {
164        parent = *p;
165        entry = rb_entry(parent, struct tree_entry, rb_node);
166
167        if (offset < entry->start)
168            p = &(*p)->rb_left;
169        else if (offset > entry->end)
170            p = &(*p)->rb_right;
171        else
172            return parent;
173    }
174
175    entry = rb_entry(node, struct tree_entry, rb_node);
176    rb_link_node(node, parent, p);
177    rb_insert_color(node, root);
178    return NULL;
179}
180
181static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
182                     struct rb_node **prev_ret,
183                     struct rb_node **next_ret)
184{
185    struct rb_root *root = &tree->state;
186    struct rb_node *n = root->rb_node;
187    struct rb_node *prev = NULL;
188    struct rb_node *orig_prev = NULL;
189    struct tree_entry *entry;
190    struct tree_entry *prev_entry = NULL;
191
192    while (n) {
193        entry = rb_entry(n, struct tree_entry, rb_node);
194        prev = n;
195        prev_entry = entry;
196
197        if (offset < entry->start)
198            n = n->rb_left;
199        else if (offset > entry->end)
200            n = n->rb_right;
201        else
202            return n;
203    }
204
205    if (prev_ret) {
206        orig_prev = prev;
207        while (prev && offset > prev_entry->end) {
208            prev = rb_next(prev);
209            prev_entry = rb_entry(prev, struct tree_entry, rb_node);
210        }
211        *prev_ret = prev;
212        prev = orig_prev;
213    }
214
215    if (next_ret) {
216        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
217        while (prev && offset < prev_entry->start) {
218            prev = rb_prev(prev);
219            prev_entry = rb_entry(prev, struct tree_entry, rb_node);
220        }
221        *next_ret = prev;
222    }
223    return NULL;
224}
225
226static inline struct rb_node *tree_search(struct extent_io_tree *tree,
227                      u64 offset)
228{
229    struct rb_node *prev = NULL;
230    struct rb_node *ret;
231
232    ret = __etree_search(tree, offset, &prev, NULL);
233    if (!ret)
234        return prev;
235    return ret;
236}
237
238static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
239                      u64 offset, struct rb_node *node)
240{
241    struct rb_root *root = &tree->buffer;
242    struct rb_node **p = &root->rb_node;
243    struct rb_node *parent = NULL;
244    struct extent_buffer *eb;
245
246    while (*p) {
247        parent = *p;
248        eb = rb_entry(parent, struct extent_buffer, rb_node);
249
250        if (offset < eb->start)
251            p = &(*p)->rb_left;
252        else if (offset > eb->start)
253            p = &(*p)->rb_right;
254        else
255            return eb;
256    }
257
258    rb_link_node(node, parent, p);
259    rb_insert_color(node, root);
260    return NULL;
261}
262
263static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
264                       u64 offset)
265{
266    struct rb_root *root = &tree->buffer;
267    struct rb_node *n = root->rb_node;
268    struct extent_buffer *eb;
269
270    while (n) {
271        eb = rb_entry(n, struct extent_buffer, rb_node);
272        if (offset < eb->start)
273            n = n->rb_left;
274        else if (offset > eb->start)
275            n = n->rb_right;
276        else
277            return eb;
278    }
279    return NULL;
280}
281
282static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
283             struct extent_state *other)
284{
285    if (tree->ops && tree->ops->merge_extent_hook)
286        tree->ops->merge_extent_hook(tree->mapping->host, new,
287                         other);
288}
289
290/*
291 * utility function to look for merge candidates inside a given range.
292 * Any extents with matching state are merged together into a single
293 * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
294 * in their state field are not merged because the end_io handlers need
295 * to do operations on them without sleeping (or doing allocations/splits).
296 *
297 * This should be called with the tree lock held.
298 */
299static int merge_state(struct extent_io_tree *tree,
300               struct extent_state *state)
301{
302    struct extent_state *other;
303    struct rb_node *other_node;
304
305    if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
306        return 0;
307
308    other_node = rb_prev(&state->rb_node);
309    if (other_node) {
310        other = rb_entry(other_node, struct extent_state, rb_node);
311        if (other->end == state->start - 1 &&
312            other->state == state->state) {
313            merge_cb(tree, state, other);
314            state->start = other->start;
315            other->tree = NULL;
316            rb_erase(&other->rb_node, &tree->state);
317            free_extent_state(other);
318        }
319    }
320    other_node = rb_next(&state->rb_node);
321    if (other_node) {
322        other = rb_entry(other_node, struct extent_state, rb_node);
323        if (other->start == state->end + 1 &&
324            other->state == state->state) {
325            merge_cb(tree, state, other);
326            other->start = state->start;
327            state->tree = NULL;
328            rb_erase(&state->rb_node, &tree->state);
329            free_extent_state(state);
330            state = NULL;
331        }
332    }
333
334    return 0;
335}
336
337static int set_state_cb(struct extent_io_tree *tree,
338             struct extent_state *state,
339             unsigned long bits)
340{
341    if (tree->ops && tree->ops->set_bit_hook) {
342        return tree->ops->set_bit_hook(tree->mapping->host,
343                           state->start, state->end,
344                           state->state, bits);
345    }
346
347    return 0;
348}
349
350static void clear_state_cb(struct extent_io_tree *tree,
351               struct extent_state *state,
352               unsigned long bits)
353{
354    if (tree->ops && tree->ops->clear_bit_hook)
355        tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
356}
357
358/*
359 * insert an extent_state struct into the tree. 'bits' are set on the
360 * struct before it is inserted.
361 *
362 * This may return -EEXIST if the extent is already there, in which case the
363 * state struct is freed.
364 *
365 * The tree lock is not taken internally. This is a utility function and
366 * probably isn't what you want to call (see set/clear_extent_bit).
367 */
368static int insert_state(struct extent_io_tree *tree,
369            struct extent_state *state, u64 start, u64 end,
370            int bits)
371{
372    struct rb_node *node;
373    int ret;
374
375    if (end < start) {
376        printk(KERN_ERR "btrfs end < start %llu %llu\n",
377               (unsigned long long)end,
378               (unsigned long long)start);
379        WARN_ON(1);
380    }
381    state->start = start;
382    state->end = end;
383    ret = set_state_cb(tree, state, bits);
384    if (ret)
385        return ret;
386
387    if (bits & EXTENT_DIRTY)
388        tree->dirty_bytes += end - start + 1;
389    state->state |= bits;
390    node = tree_insert(&tree->state, end, &state->rb_node);
391    if (node) {
392        struct extent_state *found;
393        found = rb_entry(node, struct extent_state, rb_node);
394        printk(KERN_ERR "btrfs found node %llu %llu on insert of "
395               "%llu %llu\n", (unsigned long long)found->start,
396               (unsigned long long)found->end,
397               (unsigned long long)start, (unsigned long long)end);
398        free_extent_state(state);
399        return -EEXIST;
400    }
401    state->tree = tree;
402    merge_state(tree, state);
403    return 0;
404}
405
406static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
407             u64 split)
408{
409    if (tree->ops && tree->ops->split_extent_hook)
410        return tree->ops->split_extent_hook(tree->mapping->host,
411                            orig, split);
412    return 0;
413}
414
415/*
416 * split a given extent state struct in two, inserting the preallocated
417 * struct 'prealloc' as the newly created first half. 'split' indicates an
418 * offset inside 'orig' where it should be split.
419 *
420 * Before calling, the tree has 'orig' at [orig->start, orig->end].
421 * After calling, there are two extent state structs in the tree:
422 *
423 * prealloc: [orig->start, split - 1]
424 * orig: [ split, orig->end ]
425 *
426 * The tree locks are not taken by this function. They need to be held
427 * by the caller.
428 */
429static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
430               struct extent_state *prealloc, u64 split)
431{
432    struct rb_node *node;
433
434    split_cb(tree, orig, split);
435
436    prealloc->start = orig->start;
437    prealloc->end = split - 1;
438    prealloc->state = orig->state;
439    orig->start = split;
440
441    node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
442    if (node) {
443        free_extent_state(prealloc);
444        return -EEXIST;
445    }
446    prealloc->tree = tree;
447    return 0;
448}
449
450/*
451 * utility function to clear some bits in an extent state struct.
452 * it will optionally wake up anyone waiting on this state (wake == 1), or
453 * forcibly remove the state from the tree (delete == 1).
454 *
455 * If no bits are set on the state struct after clearing things, the
456 * struct is freed and removed from the tree
457 */
458static int clear_state_bit(struct extent_io_tree *tree,
459                struct extent_state *state, int bits, int wake,
460                int delete)
461{
462    int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
463    int ret = state->state & bits_to_clear;
464
465    if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466        u64 range = state->end - state->start + 1;
467        WARN_ON(range > tree->dirty_bytes);
468        tree->dirty_bytes -= range;
469    }
470    clear_state_cb(tree, state, bits);
471    state->state &= ~bits_to_clear;
472    if (wake)
473        wake_up(&state->wq);
474    if (delete || state->state == 0) {
475        if (state->tree) {
476            clear_state_cb(tree, state, state->state);
477            rb_erase(&state->rb_node, &tree->state);
478            state->tree = NULL;
479            free_extent_state(state);
480        } else {
481            WARN_ON(1);
482        }
483    } else {
484        merge_state(tree, state);
485    }
486    return ret;
487}
488
489/*
490 * clear some bits on a range in the tree. This may require splitting
491 * or inserting elements in the tree, so the gfp mask is used to
492 * indicate which allocations or sleeping are allowed.
493 *
494 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
495 * the given range from the tree regardless of state (ie for truncate).
496 *
497 * the range [start, end] is inclusive.
498 *
499 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
500 * bits were already set, or zero if none of the bits were already set.
501 */
502int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
503             int bits, int wake, int delete,
504             struct extent_state **cached_state,
505             gfp_t mask)
506{
507    struct extent_state *state;
508    struct extent_state *cached;
509    struct extent_state *prealloc = NULL;
510    struct rb_node *next_node;
511    struct rb_node *node;
512    u64 last_end;
513    int err;
514    int set = 0;
515    int clear = 0;
516
517    if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518        clear = 1;
519again:
520    if (!prealloc && (mask & __GFP_WAIT)) {
521        prealloc = alloc_extent_state(mask);
522        if (!prealloc)
523            return -ENOMEM;
524    }
525
526    spin_lock(&tree->lock);
527    if (cached_state) {
528        cached = *cached_state;
529
530        if (clear) {
531            *cached_state = NULL;
532            cached_state = NULL;
533        }
534
535        if (cached && cached->tree && cached->start == start) {
536            if (clear)
537                atomic_dec(&cached->refs);
538            state = cached;
539            goto hit_next;
540        }
541        if (clear)
542            free_extent_state(cached);
543    }
544    /*
545     * this search will find the extents that end after
546     * our range starts
547     */
548    node = tree_search(tree, start);
549    if (!node)
550        goto out;
551    state = rb_entry(node, struct extent_state, rb_node);
552hit_next:
553    if (state->start > end)
554        goto out;
555    WARN_ON(state->end < start);
556    last_end = state->end;
557
558    /*
559     * | ---- desired range ---- |
560     * | state | or
561     * | ------------- state -------------- |
562     *
563     * We need to split the extent we found, and may flip
564     * bits on second half.
565     *
566     * If the extent we found extends past our range, we
567     * just split and search again. It'll get split again
568     * the next time though.
569     *
570     * If the extent we found is inside our range, we clear
571     * the desired bit on it.
572     */
573
574    if (state->start < start) {
575        if (!prealloc)
576            prealloc = alloc_extent_state(GFP_ATOMIC);
577        err = split_state(tree, state, prealloc, start);
578        BUG_ON(err == -EEXIST);
579        prealloc = NULL;
580        if (err)
581            goto out;
582        if (state->end <= end) {
583            set |= clear_state_bit(tree, state, bits, wake,
584                           delete);
585            if (last_end == (u64)-1)
586                goto out;
587            start = last_end + 1;
588        }
589        goto search_again;
590    }
591    /*
592     * | ---- desired range ---- |
593     * | state |
594     * We need to split the extent, and clear the bit
595     * on the first half
596     */
597    if (state->start <= end && state->end > end) {
598        if (!prealloc)
599            prealloc = alloc_extent_state(GFP_ATOMIC);
600        err = split_state(tree, state, prealloc, end + 1);
601        BUG_ON(err == -EEXIST);
602        if (wake)
603            wake_up(&state->wq);
604
605        set |= clear_state_bit(tree, prealloc, bits, wake, delete);
606
607        prealloc = NULL;
608        goto out;
609    }
610
611    if (state->end < end && prealloc && !need_resched())
612        next_node = rb_next(&state->rb_node);
613    else
614        next_node = NULL;
615
616    set |= clear_state_bit(tree, state, bits, wake, delete);
617    if (last_end == (u64)-1)
618        goto out;
619    start = last_end + 1;
620    if (start <= end && next_node) {
621        state = rb_entry(next_node, struct extent_state,
622                 rb_node);
623        if (state->start == start)
624            goto hit_next;
625    }
626    goto search_again;
627
628out:
629    spin_unlock(&tree->lock);
630    if (prealloc)
631        free_extent_state(prealloc);
632
633    return set;
634
635search_again:
636    if (start > end)
637        goto out;
638    spin_unlock(&tree->lock);
639    if (mask & __GFP_WAIT)
640        cond_resched();
641    goto again;
642}
643
644static int wait_on_state(struct extent_io_tree *tree,
645             struct extent_state *state)
646        __releases(tree->lock)
647        __acquires(tree->lock)
648{
649    DEFINE_WAIT(wait);
650    prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
651    spin_unlock(&tree->lock);
652    schedule();
653    spin_lock(&tree->lock);
654    finish_wait(&state->wq, &wait);
655    return 0;
656}
657
658/*
659 * waits for one or more bits to clear on a range in the state tree.
660 * The range [start, end] is inclusive.
661 * The tree lock is taken by this function
662 */
663int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
664{
665    struct extent_state *state;
666    struct rb_node *node;
667
668    spin_lock(&tree->lock);
669again:
670    while (1) {
671        /*
672         * this search will find all the extents that end after
673         * our range starts
674         */
675        node = tree_search(tree, start);
676        if (!node)
677            break;
678
679        state = rb_entry(node, struct extent_state, rb_node);
680
681        if (state->start > end)
682            goto out;
683
684        if (state->state & bits) {
685            start = state->start;
686            atomic_inc(&state->refs);
687            wait_on_state(tree, state);
688            free_extent_state(state);
689            goto again;
690        }
691        start = state->end + 1;
692
693        if (start > end)
694            break;
695
696        if (need_resched()) {
697            spin_unlock(&tree->lock);
698            cond_resched();
699            spin_lock(&tree->lock);
700        }
701    }
702out:
703    spin_unlock(&tree->lock);
704    return 0;
705}
706
707static int set_state_bits(struct extent_io_tree *tree,
708               struct extent_state *state,
709               int bits)
710{
711    int ret;
712
713    ret = set_state_cb(tree, state, bits);
714    if (ret)
715        return ret;
716
717    if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718        u64 range = state->end - state->start + 1;
719        tree->dirty_bytes += range;
720    }
721    state->state |= bits;
722
723    return 0;
724}
725
726static void cache_state(struct extent_state *state,
727            struct extent_state **cached_ptr)
728{
729    if (cached_ptr && !(*cached_ptr)) {
730        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
731            *cached_ptr = state;
732            atomic_inc(&state->refs);
733        }
734    }
735}
736
737/*
738 * set some bits on a range in the tree. This may require allocations or
739 * sleeping, so the gfp mask is used to indicate what is allowed.
740 *
741 * If any of the exclusive bits are set, this will fail with -EEXIST if some
742 * part of the range already has the desired bits set. The start of the
743 * existing range is returned in failed_start in this case.
744 *
745 * [start, end] is inclusive. This takes the tree lock.
746 */
747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749              int bits, int exclusive_bits, u64 *failed_start,
750              struct extent_state **cached_state,
751              gfp_t mask)
752{
753    struct extent_state *state;
754    struct extent_state *prealloc = NULL;
755    struct rb_node *node;
756    int err = 0;
757    u64 last_start;
758    u64 last_end;
759
760again:
761    if (!prealloc && (mask & __GFP_WAIT)) {
762        prealloc = alloc_extent_state(mask);
763        if (!prealloc)
764            return -ENOMEM;
765    }
766
767    spin_lock(&tree->lock);
768    if (cached_state && *cached_state) {
769        state = *cached_state;
770        if (state->start == start && state->tree) {
771            node = &state->rb_node;
772            goto hit_next;
773        }
774    }
775    /*
776     * this search will find all the extents that end after
777     * our range starts.
778     */
779    node = tree_search(tree, start);
780    if (!node) {
781        err = insert_state(tree, prealloc, start, end, bits);
782        prealloc = NULL;
783        BUG_ON(err == -EEXIST);
784        goto out;
785    }
786    state = rb_entry(node, struct extent_state, rb_node);
787hit_next:
788    last_start = state->start;
789    last_end = state->end;
790
791    /*
792     * | ---- desired range ---- |
793     * | state |
794     *
795     * Just lock what we found and keep going
796     */
797    if (state->start == start && state->end <= end) {
798        struct rb_node *next_node;
799        if (state->state & exclusive_bits) {
800            *failed_start = state->start;
801            err = -EEXIST;
802            goto out;
803        }
804
805        err = set_state_bits(tree, state, bits);
806        if (err)
807            goto out;
808
809        cache_state(state, cached_state);
810        merge_state(tree, state);
811        if (last_end == (u64)-1)
812            goto out;
813
814        start = last_end + 1;
815        if (start < end && prealloc && !need_resched()) {
816            next_node = rb_next(node);
817            if (next_node) {
818                state = rb_entry(next_node, struct extent_state,
819                         rb_node);
820                if (state->start == start)
821                    goto hit_next;
822            }
823        }
824        goto search_again;
825    }
826
827    /*
828     * | ---- desired range ---- |
829     * | state |
830     * or
831     * | ------------- state -------------- |
832     *
833     * We need to split the extent we found, and may flip bits on
834     * second half.
835     *
836     * If the extent we found extends past our
837     * range, we just split and search again. It'll get split
838     * again the next time though.
839     *
840     * If the extent we found is inside our range, we set the
841     * desired bit on it.
842     */
843    if (state->start < start) {
844        if (state->state & exclusive_bits) {
845            *failed_start = start;
846            err = -EEXIST;
847            goto out;
848        }
849        err = split_state(tree, state, prealloc, start);
850        BUG_ON(err == -EEXIST);
851        prealloc = NULL;
852        if (err)
853            goto out;
854        if (state->end <= end) {
855            err = set_state_bits(tree, state, bits);
856            if (err)
857                goto out;
858            cache_state(state, cached_state);
859            merge_state(tree, state);
860            if (last_end == (u64)-1)
861                goto out;
862            start = last_end + 1;
863        }
864        goto search_again;
865    }
866    /*
867     * | ---- desired range ---- |
868     * | state | or | state |
869     *
870     * There's a hole, we need to insert something in it and
871     * ignore the extent we found.
872     */
873    if (state->start > start) {
874        u64 this_end;
875        if (end < last_start)
876            this_end = end;
877        else
878            this_end = last_start - 1;
879        err = insert_state(tree, prealloc, start, this_end,
880                   bits);
881        BUG_ON(err == -EEXIST);
882        if (err) {
883            prealloc = NULL;
884            goto out;
885        }
886        cache_state(prealloc, cached_state);
887        prealloc = NULL;
888        start = this_end + 1;
889        goto search_again;
890    }
891    /*
892     * | ---- desired range ---- |
893     * | state |
894     * We need to split the extent, and set the bit
895     * on the first half
896     */
897    if (state->start <= end && state->end > end) {
898        if (state->state & exclusive_bits) {
899            *failed_start = start;
900            err = -EEXIST;
901            goto out;
902        }
903        err = split_state(tree, state, prealloc, end + 1);
904        BUG_ON(err == -EEXIST);
905
906        err = set_state_bits(tree, prealloc, bits);
907        if (err) {
908            prealloc = NULL;
909            goto out;
910        }
911        cache_state(prealloc, cached_state);
912        merge_state(tree, prealloc);
913        prealloc = NULL;
914        goto out;
915    }
916
917    goto search_again;
918
919out:
920    spin_unlock(&tree->lock);
921    if (prealloc)
922        free_extent_state(prealloc);
923
924    return err;
925
926search_again:
927    if (start > end)
928        goto out;
929    spin_unlock(&tree->lock);
930    if (mask & __GFP_WAIT)
931        cond_resched();
932    goto again;
933}
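
/*
 * Hedged usage sketch (not from the original source) for the exclusive
 * bits behaviour documented in the comment above set_extent_bit(): when
 * part of the range already has the exclusive bit, the call fails with
 * -EEXIST and failed_start reports where the conflict begins, e.g.
 *
 *	u64 failed_start;
 *	int err;
 *
 *	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
 *			     &failed_start, NULL, GFP_NOFS);
 *	if (err == -EEXIST)
 *		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 *
 * lock_extent_bits() below wraps essentially this retry loop.
 */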
934
935/* wrappers around set/clear extent bit */
936int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
937             gfp_t mask)
938{
939    return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
940                  NULL, mask);
941}
942
943int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
944            int bits, gfp_t mask)
945{
946    return set_extent_bit(tree, start, end, bits, 0, NULL,
947                  NULL, mask);
948}
949
950int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
951              int bits, gfp_t mask)
952{
953    return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
954}
955
956int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
957            struct extent_state **cached_state, gfp_t mask)
958{
959    return set_extent_bit(tree, start, end,
960                  EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
961                  0, NULL, cached_state, mask);
962}
963
964int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
965               gfp_t mask)
966{
967    return clear_extent_bit(tree, start, end,
968                EXTENT_DIRTY | EXTENT_DELALLOC |
969                EXTENT_DO_ACCOUNTING, 0, 0,
970                NULL, mask);
971}
972
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
974             gfp_t mask)
975{
976    return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
977                  NULL, mask);
978}
979
980static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
981               gfp_t mask)
982{
983    return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
984                NULL, mask);
985}
986
987int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
988            gfp_t mask)
989{
990    return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
991                  NULL, mask);
992}
993
994static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
995                 u64 end, struct extent_state **cached_state,
996                 gfp_t mask)
997{
998    return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
999                cached_state, mask);
1000}
1001
1002int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1003{
1004    return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
1005}
1006
1007/*
1008 * either insert or lock the state struct between start and end. Use mask
1009 * to tell us if waiting is desired.
1010 */
1011int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1012             int bits, struct extent_state **cached_state, gfp_t mask)
1013{
1014    int err;
1015    u64 failed_start;
1016    while (1) {
1017        err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1018                     EXTENT_LOCKED, &failed_start,
1019                     cached_state, mask);
1020        if (err == -EEXIST && (mask & __GFP_WAIT)) {
1021            wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1022            start = failed_start;
1023        } else {
1024            break;
1025        }
1026        WARN_ON(start > end);
1027    }
1028    return err;
1029}
1030
1031int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1032{
1033    return lock_extent_bits(tree, start, end, 0, NULL, mask);
1034}
1035
1036int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1037            gfp_t mask)
1038{
1039    int err;
1040    u64 failed_start;
1041
1042    err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1043                 &failed_start, NULL, mask);
1044    if (err == -EEXIST) {
1045        if (failed_start > start)
1046            clear_extent_bit(tree, start, failed_start - 1,
1047                     EXTENT_LOCKED, 1, 0, NULL, mask);
1048        return 0;
1049    }
1050    return 1;
1051}
1052
1053int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1054             struct extent_state **cached, gfp_t mask)
1055{
1056    return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1057                mask);
1058}
1059
1060int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1061          gfp_t mask)
1062{
1063    return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1064                mask);
1065}
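
/*
 * Illustrative calling pattern (an assumed example, not code from this
 * file): the locking helpers serialize I/O on a byte range. A caller locks
 * the inclusive [start, end] range, operates on the pages backing it, then
 * unlocks the same range:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or dirty the pages covering [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 */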
1066
1067/*
1068 * helper function to set pages and extents in the tree dirty
1069 */
1070int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
1071{
1072    unsigned long index = start >> PAGE_CACHE_SHIFT;
1073    unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1074    struct page *page;
1075
1076    while (index <= end_index) {
1077        page = find_get_page(tree->mapping, index);
1078        BUG_ON(!page);
1079        __set_page_dirty_nobuffers(page);
1080        page_cache_release(page);
1081        index++;
1082    }
1083    return 0;
1084}
1085
1086/*
1087 * helper function to set both pages and extents in the tree writeback
1088 */
1089static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1090{
1091    unsigned long index = start >> PAGE_CACHE_SHIFT;
1092    unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1093    struct page *page;
1094
1095    while (index <= end_index) {
1096        page = find_get_page(tree->mapping, index);
1097        BUG_ON(!page);
1098        set_page_writeback(page);
1099        page_cache_release(page);
1100        index++;
1101    }
1102    return 0;
1103}
1104
1105/*
1106 * find the first offset in the io tree with 'bits' set. zero is
1107 * returned if we find something, and *start_ret and *end_ret are
1108 * set to reflect the state struct that was found.
1109 *
1110 * If nothing was found, 1 is returned, < 0 on error
1111 */
1112int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1113              u64 *start_ret, u64 *end_ret, int bits)
1114{
1115    struct rb_node *node;
1116    struct extent_state *state;
1117    int ret = 1;
1118
1119    spin_lock(&tree->lock);
1120    /*
1121     * this search will find all the extents that end after
1122     * our range starts.
1123     */
1124    node = tree_search(tree, start);
1125    if (!node)
1126        goto out;
1127
1128    while (1) {
1129        state = rb_entry(node, struct extent_state, rb_node);
1130        if (state->end >= start && (state->state & bits)) {
1131            *start_ret = state->start;
1132            *end_ret = state->end;
1133            ret = 0;
1134            break;
1135        }
1136        node = rb_next(node);
1137        if (!node)
1138            break;
1139    }
1140out:
1141    spin_unlock(&tree->lock);
1142    return ret;
1143}
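
/*
 * Example (a sketch, not present in the original file): all ranges with a
 * given bit set can be walked by repeatedly searching from just past the
 * previous hit, relying on the 0/1 return convention described above:
 *
 *	u64 found_start, found_end;
 *	u64 cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY)) {
 *		... process [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 */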
1144
1145/* find the first state struct with 'bits' set after 'start', and
1146 * return it. tree->lock must be held. NULL will be returned if
1147 * nothing was found after 'start'
1148 */
1149struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1150                         u64 start, int bits)
1151{
1152    struct rb_node *node;
1153    struct extent_state *state;
1154
1155    /*
1156     * this search will find all the extents that end after
1157     * our range starts.
1158     */
1159    node = tree_search(tree, start);
1160    if (!node)
1161        goto out;
1162
1163    while (1) {
1164        state = rb_entry(node, struct extent_state, rb_node);
1165        if (state->end >= start && (state->state & bits))
1166            return state;
1167
1168        node = rb_next(node);
1169        if (!node)
1170            break;
1171    }
1172out:
1173    return NULL;
1174}
1175
1176/*
1177 * find a contiguous range of bytes in the file marked as delalloc, not
1178 * more than 'max_bytes'. start and end are used to return the range.
1179 *
1180 * 1 is returned if we find something, 0 if nothing was in the tree
1181 */
1182static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1183                    u64 *start, u64 *end, u64 max_bytes,
1184                    struct extent_state **cached_state)
1185{
1186    struct rb_node *node;
1187    struct extent_state *state;
1188    u64 cur_start = *start;
1189    u64 found = 0;
1190    u64 total_bytes = 0;
1191
1192    spin_lock(&tree->lock);
1193
1194    /*
1195     * this search will find all the extents that end after
1196     * our range starts.
1197     */
1198    node = tree_search(tree, cur_start);
1199    if (!node) {
1200        if (!found)
1201            *end = (u64)-1;
1202        goto out;
1203    }
1204
1205    while (1) {
1206        state = rb_entry(node, struct extent_state, rb_node);
1207        if (found && (state->start != cur_start ||
1208                  (state->state & EXTENT_BOUNDARY))) {
1209            goto out;
1210        }
1211        if (!(state->state & EXTENT_DELALLOC)) {
1212            if (!found)
1213                *end = state->end;
1214            goto out;
1215        }
1216        if (!found) {
1217            *start = state->start;
1218            *cached_state = state;
1219            atomic_inc(&state->refs);
1220        }
1221        found++;
1222        *end = state->end;
1223        cur_start = state->end + 1;
1224        node = rb_next(node);
1225        if (!node)
1226            break;
1227        total_bytes += state->end - state->start + 1;
1228        if (total_bytes >= max_bytes)
1229            break;
1230    }
1231out:
1232    spin_unlock(&tree->lock);
1233    return found;
1234}
1235
1236static noinline int __unlock_for_delalloc(struct inode *inode,
1237                      struct page *locked_page,
1238                      u64 start, u64 end)
1239{
1240    int ret;
1241    struct page *pages[16];
1242    unsigned long index = start >> PAGE_CACHE_SHIFT;
1243    unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1244    unsigned long nr_pages = end_index - index + 1;
1245    int i;
1246
1247    if (index == locked_page->index && end_index == index)
1248        return 0;
1249
1250    while (nr_pages > 0) {
1251        ret = find_get_pages_contig(inode->i_mapping, index,
1252                     min_t(unsigned long, nr_pages,
1253                     ARRAY_SIZE(pages)), pages);
1254        for (i = 0; i < ret; i++) {
1255            if (pages[i] != locked_page)
1256                unlock_page(pages[i]);
1257            page_cache_release(pages[i]);
1258        }
1259        nr_pages -= ret;
1260        index += ret;
1261        cond_resched();
1262    }
1263    return 0;
1264}
1265
1266static noinline int lock_delalloc_pages(struct inode *inode,
1267                    struct page *locked_page,
1268                    u64 delalloc_start,
1269                    u64 delalloc_end)
1270{
1271    unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1272    unsigned long start_index = index;
1273    unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1274    unsigned long pages_locked = 0;
1275    struct page *pages[16];
1276    unsigned long nrpages;
1277    int ret;
1278    int i;
1279
1280    /* the caller is responsible for locking the start index */
1281    if (index == locked_page->index && index == end_index)
1282        return 0;
1283
1284    /* skip the page at the start index */
1285    nrpages = end_index - index + 1;
1286    while (nrpages > 0) {
1287        ret = find_get_pages_contig(inode->i_mapping, index,
1288                     min_t(unsigned long,
1289                     nrpages, ARRAY_SIZE(pages)), pages);
1290        if (ret == 0) {
1291            ret = -EAGAIN;
1292            goto done;
1293        }
1294        /* now we have an array of pages, lock them all */
1295        for (i = 0; i < ret; i++) {
1296            /*
1297             * the caller is taking responsibility for
1298             * locked_page
1299             */
1300            if (pages[i] != locked_page) {
1301                lock_page(pages[i]);
1302                if (!PageDirty(pages[i]) ||
1303                    pages[i]->mapping != inode->i_mapping) {
1304                    ret = -EAGAIN;
1305                    unlock_page(pages[i]);
1306                    page_cache_release(pages[i]);
1307                    goto done;
1308                }
1309            }
1310            page_cache_release(pages[i]);
1311            pages_locked++;
1312        }
1313        nrpages -= ret;
1314        index += ret;
1315        cond_resched();
1316    }
1317    ret = 0;
1318done:
1319    if (ret && pages_locked) {
1320        __unlock_for_delalloc(inode, locked_page,
1321                  delalloc_start,
1322                  ((u64)(start_index + pages_locked - 1)) <<
1323                  PAGE_CACHE_SHIFT);
1324    }
1325    return ret;
1326}
1327
1328/*
1329 * find a contiguous range of bytes in the file marked as delalloc, not
1330 * more than 'max_bytes'. start and end are used to return the range.
1331 *
1332 * 1 is returned if we find something, 0 if nothing was in the tree
1333 */
1334static noinline u64 find_lock_delalloc_range(struct inode *inode,
1335                         struct extent_io_tree *tree,
1336                         struct page *locked_page,
1337                         u64 *start, u64 *end,
1338                         u64 max_bytes)
1339{
1340    u64 delalloc_start;
1341    u64 delalloc_end;
1342    u64 found;
1343    struct extent_state *cached_state = NULL;
1344    int ret;
1345    int loops = 0;
1346
1347again:
1348    /* step one, find a bunch of delalloc bytes starting at start */
1349    delalloc_start = *start;
1350    delalloc_end = 0;
1351    found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1352                    max_bytes, &cached_state);
1353    if (!found || delalloc_end <= *start) {
1354        *start = delalloc_start;
1355        *end = delalloc_end;
1356        free_extent_state(cached_state);
1357        return found;
1358    }
1359
1360    /*
1361     * start comes from the offset of locked_page. We have to lock
1362     * pages in order, so we can't process delalloc bytes before
1363     * locked_page
1364     */
1365    if (delalloc_start < *start)
1366        delalloc_start = *start;
1367
1368    /*
1369     * make sure to limit the number of pages we try to lock down
1370     * if we're looping.
1371     */
1372    if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1373        delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1374
1375    /* step two, lock all the pages after the page that has start */
1376    ret = lock_delalloc_pages(inode, locked_page,
1377                  delalloc_start, delalloc_end);
1378    if (ret == -EAGAIN) {
1379        /* some of the pages are gone, let's avoid looping by
1380         * shortening the size of the delalloc range we're searching
1381         */
1382        free_extent_state(cached_state);
1383        if (!loops) {
1384            unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1385            max_bytes = PAGE_CACHE_SIZE - offset;
1386            loops = 1;
1387            goto again;
1388        } else {
1389            found = 0;
1390            goto out_failed;
1391        }
1392    }
1393    BUG_ON(ret);
1394
1395    /* step three, lock the state bits for the whole range */
1396    lock_extent_bits(tree, delalloc_start, delalloc_end,
1397             0, &cached_state, GFP_NOFS);
1398
1399    /* then test to make sure it is all still delalloc */
1400    ret = test_range_bit(tree, delalloc_start, delalloc_end,
1401                 EXTENT_DELALLOC, 1, cached_state);
1402    if (!ret) {
1403        unlock_extent_cached(tree, delalloc_start, delalloc_end,
1404                     &cached_state, GFP_NOFS);
1405        __unlock_for_delalloc(inode, locked_page,
1406                  delalloc_start, delalloc_end);
1407        cond_resched();
1408        goto again;
1409    }
1410    free_extent_state(cached_state);
1411    *start = delalloc_start;
1412    *end = delalloc_end;
1413out_failed:
1414    return found;
1415}
1416
1417int extent_clear_unlock_delalloc(struct inode *inode,
1418                struct extent_io_tree *tree,
1419                u64 start, u64 end, struct page *locked_page,
1420                unsigned long op)
1421{
1422    int ret;
1423    struct page *pages[16];
1424    unsigned long index = start >> PAGE_CACHE_SHIFT;
1425    unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1426    unsigned long nr_pages = end_index - index + 1;
1427    int i;
1428    int clear_bits = 0;
1429
1430    if (op & EXTENT_CLEAR_UNLOCK)
1431        clear_bits |= EXTENT_LOCKED;
1432    if (op & EXTENT_CLEAR_DIRTY)
1433        clear_bits |= EXTENT_DIRTY;
1434
1435    if (op & EXTENT_CLEAR_DELALLOC)
1436        clear_bits |= EXTENT_DELALLOC;
1437
1438    if (op & EXTENT_CLEAR_ACCOUNTING)
1439        clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441    clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442    if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443            EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1444            EXTENT_SET_PRIVATE2)))
1445        return 0;
1446
1447    while (nr_pages > 0) {
1448        ret = find_get_pages_contig(inode->i_mapping, index,
1449                     min_t(unsigned long,
1450                     nr_pages, ARRAY_SIZE(pages)), pages);
1451        for (i = 0; i < ret; i++) {
1452
1453            if (op & EXTENT_SET_PRIVATE2)
1454                SetPagePrivate2(pages[i]);
1455
1456            if (pages[i] == locked_page) {
1457                page_cache_release(pages[i]);
1458                continue;
1459            }
1460            if (op & EXTENT_CLEAR_DIRTY)
1461                clear_page_dirty_for_io(pages[i]);
1462            if (op & EXTENT_SET_WRITEBACK)
1463                set_page_writeback(pages[i]);
1464            if (op & EXTENT_END_WRITEBACK)
1465                end_page_writeback(pages[i]);
1466            if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1467                unlock_page(pages[i]);
1468            page_cache_release(pages[i]);
1469        }
1470        nr_pages -= ret;
1471        index += ret;
1472        cond_resched();
1473    }
1474    return 0;
1475}
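
/*
 * Sketch of how the 'op' bitmask above is typically combined (assumed for
 * illustration; the real callers live in the btrfs inode code, and the
 * exact flag set depends on the path):
 *
 *	extent_clear_unlock_delalloc(inode, tree, start, end, locked_page,
 *				     EXTENT_CLEAR_UNLOCK_PAGE |
 *				     EXTENT_CLEAR_UNLOCK |
 *				     EXTENT_CLEAR_DELALLOC |
 *				     EXTENT_CLEAR_DIRTY |
 *				     EXTENT_SET_WRITEBACK |
 *				     EXTENT_END_WRITEBACK);
 */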
1476
1477/*
1478 * count the number of bytes in the tree that have a given bit(s)
1479 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1480 * cached. The total number found is returned.
1481 */
1482u64 count_range_bits(struct extent_io_tree *tree,
1483             u64 *start, u64 search_end, u64 max_bytes,
1484             unsigned long bits)
1485{
1486    struct rb_node *node;
1487    struct extent_state *state;
1488    u64 cur_start = *start;
1489    u64 total_bytes = 0;
1490    int found = 0;
1491
1492    if (search_end <= cur_start) {
1493        WARN_ON(1);
1494        return 0;
1495    }
1496
1497    spin_lock(&tree->lock);
1498    if (cur_start == 0 && bits == EXTENT_DIRTY) {
1499        total_bytes = tree->dirty_bytes;
1500        goto out;
1501    }
1502    /*
1503     * this search will find all the extents that end after
1504     * our range starts.
1505     */
1506    node = tree_search(tree, cur_start);
1507    if (!node)
1508        goto out;
1509
1510    while (1) {
1511        state = rb_entry(node, struct extent_state, rb_node);
1512        if (state->start > search_end)
1513            break;
1514        if (state->end >= cur_start && (state->state & bits)) {
1515            total_bytes += min(search_end, state->end) + 1 -
1516                       max(cur_start, state->start);
1517            if (total_bytes >= max_bytes)
1518                break;
1519            if (!found) {
1520                *start = state->start;
1521                found = 1;
1522            }
1523        }
1524        node = rb_next(node);
1525        if (!node)
1526            break;
1527    }
1528out:
1529    spin_unlock(&tree->lock);
1530    return total_bytes;
1531}
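
/*
 * Example use (assumed for illustration, not in the original source):
 * counting the delalloc bytes inside a file range, with the start pointer
 * updated to the first matching offset:
 *
 *	u64 found_start = range_start;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(tree, &found_start, range_end, (u64)-1,
 *				 EXTENT_DELALLOC);
 *
 * 'range_start' and 'range_end' are placeholder names for this sketch.
 */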
1532
1533/*
1534 * set the private field for a given byte offset in the tree. If there isn't
1535 * an extent_state starting at that offset, -ENOENT is returned.
1536 */
1537int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1538{
1539    struct rb_node *node;
1540    struct extent_state *state;
1541    int ret = 0;
1542
1543    spin_lock(&tree->lock);
1544    /*
1545     * this search will find all the extents that end after
1546     * our range starts.
1547     */
1548    node = tree_search(tree, start);
1549    if (!node) {
1550        ret = -ENOENT;
1551        goto out;
1552    }
1553    state = rb_entry(node, struct extent_state, rb_node);
1554    if (state->start != start) {
1555        ret = -ENOENT;
1556        goto out;
1557    }
1558    state->private = private;
1559out:
1560    spin_unlock(&tree->lock);
1561    return ret;
1562}
1563
1564int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1565{
1566    struct rb_node *node;
1567    struct extent_state *state;
1568    int ret = 0;
1569
1570    spin_lock(&tree->lock);
1571    /*
1572     * this search will find all the extents that end after
1573     * our range starts.
1574     */
1575    node = tree_search(tree, start);
1576    if (!node) {
1577        ret = -ENOENT;
1578        goto out;
1579    }
1580    state = rb_entry(node, struct extent_state, rb_node);
1581    if (state->start != start) {
1582        ret = -ENOENT;
1583        goto out;
1584    }
1585    *private = state->private;
1586out:
1587    spin_unlock(&tree->lock);
1588    return ret;
1589}
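
/*
 * Hedged sketch of the intended pairing (based on how the btrfs read path
 * uses these helpers; not code from this file): a per-extent cookie such
 * as a checksum is stashed before the read is submitted and fetched again
 * in the end_io hook:
 *
 *	set_state_private(tree, offset, (u64)csum);
 *	...
 *	u64 private;
 *
 *	if (!get_state_private(tree, offset, &private))
 *		csum = (u32)private;
 */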
1590
1591/*
1592 * searches a range in the state tree for a given mask.
1593 * If 'filled' == 1, this returns 1 only if every extent in the range
1594 * has the bits set. Otherwise, 1 is returned if any bit in the
1595 * range is found set.
1596 */
1597int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1598           int bits, int filled, struct extent_state *cached)
1599{
1600    struct extent_state *state = NULL;
1601    struct rb_node *node;
1602    int bitset = 0;
1603
1604    spin_lock(&tree->lock);
1605    if (cached && cached->tree && cached->start == start)
1606        node = &cached->rb_node;
1607    else
1608        node = tree_search(tree, start);
1609    while (node && start <= end) {
1610        state = rb_entry(node, struct extent_state, rb_node);
1611
1612        if (filled && state->start > start) {
1613            bitset = 0;
1614            break;
1615        }
1616
1617        if (state->start > end)
1618            break;
1619
1620        if (state->state & bits) {
1621            bitset = 1;
1622            if (!filled)
1623                break;
1624        } else if (filled) {
1625            bitset = 0;
1626            break;
1627        }
1628
1629        if (state->end == (u64)-1)
1630            break;
1631
1632        start = state->end + 1;
1633        if (start > end)
1634            break;
1635        node = rb_next(node);
1636        if (!node) {
1637            if (filled)
1638                bitset = 0;
1639            break;
1640        }
1641    }
1642    spin_unlock(&tree->lock);
1643    return bitset;
1644}
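
/*
 * Illustrative examples of the 'filled' flag (a sketch, not from the
 * original file):
 *
 *	test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 *		returns 1 only if every byte of [start, end] is uptodate
 *
 *	test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
 *		returns 1 if any part of [start, end] is still locked
 */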
1645
1646/*
1647 * helper function to set a given page up to date if all the
1648 * extents in the tree for that page are up to date
1649 */
1650static int check_page_uptodate(struct extent_io_tree *tree,
1651                   struct page *page)
1652{
1653    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1654    u64 end = start + PAGE_CACHE_SIZE - 1;
1655    if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1656        SetPageUptodate(page);
1657    return 0;
1658}
1659
1660/*
1661 * helper function to unlock a page if all the extents in the tree
1662 * for that page are unlocked
1663 */
1664static int check_page_locked(struct extent_io_tree *tree,
1665                 struct page *page)
1666{
1667    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1668    u64 end = start + PAGE_CACHE_SIZE - 1;
1669    if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1670        unlock_page(page);
1671    return 0;
1672}
1673
1674/*
1675 * helper function to end page writeback if all the extents
1676 * in the tree for that page are done with writeback
1677 */
1678static int check_page_writeback(struct extent_io_tree *tree,
1679                 struct page *page)
1680{
1681    end_page_writeback(page);
1682    return 0;
1683}
1684
1685/* lots and lots of room for performance fixes in the end_bio funcs */
1686
1687/*
1688 * after a writepage IO is done, we need to:
1689 * clear the uptodate bits on error
1690 * clear the writeback bits in the extent tree for this IO
1691 * end_page_writeback if the page has no more pending IO
1692 *
1693 * Scheduling is not allowed, so the extent state tree is expected
1694 * to have one and only one object corresponding to this IO.
1695 */
1696static void end_bio_extent_writepage(struct bio *bio, int err)
1697{
1698    int uptodate = err == 0;
1699    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1700    struct extent_io_tree *tree;
1701    u64 start;
1702    u64 end;
1703    int whole_page;
1704    int ret;
1705
1706    do {
1707        struct page *page = bvec->bv_page;
1708        tree = &BTRFS_I(page->mapping->host)->io_tree;
1709
1710        start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1711             bvec->bv_offset;
1712        end = start + bvec->bv_len - 1;
1713
1714        if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1715            whole_page = 1;
1716        else
1717            whole_page = 0;
1718
1719        if (--bvec >= bio->bi_io_vec)
1720            prefetchw(&bvec->bv_page->flags);
1721        if (tree->ops && tree->ops->writepage_end_io_hook) {
1722            ret = tree->ops->writepage_end_io_hook(page, start,
1723                               end, NULL, uptodate);
1724            if (ret)
1725                uptodate = 0;
1726        }
1727
1728        if (!uptodate && tree->ops &&
1729            tree->ops->writepage_io_failed_hook) {
1730            ret = tree->ops->writepage_io_failed_hook(bio, page,
1731                             start, end, NULL);
1732            if (ret == 0) {
1733                uptodate = (err == 0);
1734                continue;
1735            }
1736        }
1737
1738        if (!uptodate) {
1739            clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1740            ClearPageUptodate(page);
1741            SetPageError(page);
1742        }
1743
1744        if (whole_page)
1745            end_page_writeback(page);
1746        else
1747            check_page_writeback(tree, page);
1748    } while (bvec >= bio->bi_io_vec);
1749
1750    bio_put(bio);
1751}
1752
1753/*
1754 * after a readpage IO is done, we need to:
1755 * clear the uptodate bits on error
1756 * set the uptodate bits if things worked
1757 * set the page up to date if all extents in the tree are uptodate
1758 * clear the lock bit in the extent tree
1759 * unlock the page if there are no other extents locked for it
1760 *
1761 * Scheduling is not allowed, so the extent state tree is expected
1762 * to have one and only one object corresponding to this IO.
1763 */
1764static void end_bio_extent_readpage(struct bio *bio, int err)
1765{
1766    int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1767    struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1768    struct bio_vec *bvec = bio->bi_io_vec;
1769    struct extent_io_tree *tree;
1770    u64 start;
1771    u64 end;
1772    int whole_page;
1773    int ret;
1774
1775    if (err)
1776        uptodate = 0;
1777
1778    do {
1779        struct page *page = bvec->bv_page;
1780        tree = &BTRFS_I(page->mapping->host)->io_tree;
1781
1782        start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1783            bvec->bv_offset;
1784        end = start + bvec->bv_len - 1;
1785
1786        if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1787            whole_page = 1;
1788        else
1789            whole_page = 0;
1790
1791        if (++bvec <= bvec_end)
1792            prefetchw(&bvec->bv_page->flags);
1793
1794        if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1795            ret = tree->ops->readpage_end_io_hook(page, start, end,
1796                                  NULL);
1797            if (ret)
1798                uptodate = 0;
1799        }
1800        if (!uptodate && tree->ops &&
1801            tree->ops->readpage_io_failed_hook) {
1802            ret = tree->ops->readpage_io_failed_hook(bio, page,
1803                             start, end, NULL);
1804            if (ret == 0) {
1805                uptodate =
1806                    test_bit(BIO_UPTODATE, &bio->bi_flags);
1807                if (err)
1808                    uptodate = 0;
1809                continue;
1810            }
1811        }
1812
1813        if (uptodate) {
1814            set_extent_uptodate(tree, start, end,
1815                        GFP_ATOMIC);
1816        }
1817        unlock_extent(tree, start, end, GFP_ATOMIC);
1818
1819        if (whole_page) {
1820            if (uptodate) {
1821                SetPageUptodate(page);
1822            } else {
1823                ClearPageUptodate(page);
1824                SetPageError(page);
1825            }
1826            unlock_page(page);
1827        } else {
1828            if (uptodate) {
1829                check_page_uptodate(tree, page);
1830            } else {
1831                ClearPageUptodate(page);
1832                SetPageError(page);
1833            }
1834            check_page_locked(tree, page);
1835        }
1836    } while (bvec <= bvec_end);
1837
1838    bio_put(bio);
1839}
1840
1841/*
1842 * IO done from prepare_write is pretty simple, we just unlock
1843 * the structs in the extent tree when done, and set the uptodate bits
1844 * as appropriate.
1845 */
1846static void end_bio_extent_preparewrite(struct bio *bio, int err)
1847{
1848    const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1849    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1850    struct extent_io_tree *tree;
1851    u64 start;
1852    u64 end;
1853
1854    do {
1855        struct page *page = bvec->bv_page;
1856        tree = &BTRFS_I(page->mapping->host)->io_tree;
1857
1858        start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1859            bvec->bv_offset;
1860        end = start + bvec->bv_len - 1;
1861
1862        if (--bvec >= bio->bi_io_vec)
1863            prefetchw(&bvec->bv_page->flags);
1864
1865        if (uptodate) {
1866            set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1867        } else {
1868            ClearPageUptodate(page);
1869            SetPageError(page);
1870        }
1871
1872        unlock_extent(tree, start, end, GFP_ATOMIC);
1873
1874    } while (bvec >= bio->bi_io_vec);
1875
1876    bio_put(bio);
1877}
1878
1879static struct bio *
1880extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1881         gfp_t gfp_flags)
1882{
1883    struct bio *bio;
1884
1885    bio = bio_alloc(gfp_flags, nr_vecs);
1886
1887    if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1888        while (!bio && (nr_vecs /= 2))
1889            bio = bio_alloc(gfp_flags, nr_vecs);
1890    }
1891
1892    if (bio) {
1893        bio->bi_size = 0;
1894        bio->bi_bdev = bdev;
1895        bio->bi_sector = first_sector;
1896    }
1897    return bio;
1898}
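
/*
 * bio_alloc() can fail under memory pressure.  When the caller is a
 * memory-reclaim task (PF_MEMALLOC), extent_bio_alloc() keeps retrying
 * with the vector count halved each time, accepting a smaller bio over no
 * bio at all.  If even a single-vec allocation fails, NULL is returned;
 * note that submit_extent_page() below does not currently check for that
 * case.
 */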
1899
1900static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901              unsigned long bio_flags)
1902{
1903    int ret = 0;
1904    struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1905    struct page *page = bvec->bv_page;
1906    struct extent_io_tree *tree = bio->bi_private;
1907    u64 start;
1908    u64 end;
1909
1910    start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1911    end = start + bvec->bv_len - 1;
1912
1913    bio->bi_private = NULL;
1914
1915    bio_get(bio);
1916
1917    if (tree->ops && tree->ops->submit_bio_hook)
1918        tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919                       mirror_num, bio_flags);
1920    else
1921        submit_bio(rw, bio);
1922    if (bio_flagged(bio, BIO_EOPNOTSUPP))
1923        ret = -EOPNOTSUPP;
1924    bio_put(bio);
1925    return ret;
1926}
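
/*
 * submit_one_bio() takes an extra reference with bio_get() before the
 * submission so the bio cannot be freed by its completion handler before
 * the BIO_EOPNOTSUPP flag is checked; the matching bio_put() drops that
 * reference afterwards.  bi_private, which carried the tree pointer into
 * this function, is cleared before the bio is handed off.
 */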
1927
1928static int submit_extent_page(int rw, struct extent_io_tree *tree,
1929                  struct page *page, sector_t sector,
1930                  size_t size, unsigned long offset,
1931                  struct block_device *bdev,
1932                  struct bio **bio_ret,
1933                  unsigned long max_pages,
1934                  bio_end_io_t end_io_func,
1935                  int mirror_num,
1936                  unsigned long prev_bio_flags,
1937                  unsigned long bio_flags)
1938{
1939    int ret = 0;
1940    struct bio *bio;
1941    int nr;
1942    int contig = 0;
1943    int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1944    int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1945    size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1946
1947    if (bio_ret && *bio_ret) {
1948        bio = *bio_ret;
1949        if (old_compressed)
1950            contig = bio->bi_sector == sector;
1951        else
1952            contig = bio->bi_sector + (bio->bi_size >> 9) ==
1953                sector;
1954
1955        if (prev_bio_flags != bio_flags || !contig ||
1956            (tree->ops && tree->ops->merge_bio_hook &&
1957             tree->ops->merge_bio_hook(page, offset, page_size, bio,
1958                           bio_flags)) ||
1959            bio_add_page(bio, page, page_size, offset) < page_size) {
1960            ret = submit_one_bio(rw, bio, mirror_num,
1961                         prev_bio_flags);
1962            bio = NULL;
1963        } else {
1964            return 0;
1965        }
1966    }
1967    if (this_compressed)
1968        nr = BIO_MAX_PAGES;
1969    else
1970        nr = bio_get_nr_vecs(bdev);
1971
1972    bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1973
1974    bio_add_page(bio, page, page_size, offset);
1975    bio->bi_end_io = end_io_func;
1976    bio->bi_private = tree;
1977
1978    if (bio_ret)
1979        *bio_ret = bio;
1980    else
1981        ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1982
1983    return ret;
1984}
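
/*
 * submit_extent_page() tries hard to batch pages into one bio: a page is
 * appended to the pending *bio_ret only when the bio flags match, the new
 * sector is contiguous with the bio (or equal to its start sector for
 * compressed bios), the optional merge_bio_hook does not object, and
 * bio_add_page() accepts the full page_size.  Otherwise the pending bio
 * is submitted and a fresh one is started for this page.
 */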
1985
1986void set_page_extent_mapped(struct page *page)
1987{
1988    if (!PagePrivate(page)) {
1989        SetPagePrivate(page);
1990        page_cache_get(page);
1991        set_page_private(page, EXTENT_PAGE_PRIVATE);
1992    }
1993}
1994
1995static void set_page_extent_head(struct page *page, unsigned long len)
1996{
1997    set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1998}
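
/*
 * page->private does double duty here: plain data pages carry
 * EXTENT_PAGE_PRIVATE, while the first page of an extent buffer stores
 * EXTENT_PAGE_PRIVATE_FIRST_PAGE or'd with the buffer length shifted left
 * by two, so the length and the "head" marker share one word.  The bit
 * values themselves are assumed to be defined in the extent_io header.
 */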
1999
2000/*
2001 * basic readpage implementation. Locked extent state structs are inserted
2002 * into the tree and are removed when the IO is done (by the end_io
2002 * into the tree and are removed when the IO is done (by the end_io
2003 * handlers)
2004 */
2005static int __extent_read_full_page(struct extent_io_tree *tree,
2006                   struct page *page,
2007                   get_extent_t *get_extent,
2008                   struct bio **bio, int mirror_num,
2009                   unsigned long *bio_flags)
2010{
2011    struct inode *inode = page->mapping->host;
2012    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2013    u64 page_end = start + PAGE_CACHE_SIZE - 1;
2014    u64 end;
2015    u64 cur = start;
2016    u64 extent_offset;
2017    u64 last_byte = i_size_read(inode);
2018    u64 block_start;
2019    u64 cur_end;
2020    sector_t sector;
2021    struct extent_map *em;
2022    struct block_device *bdev;
2023    int ret;
2024    int nr = 0;
2025    size_t page_offset = 0;
2026    size_t iosize;
2027    size_t disk_io_size;
2028    size_t blocksize = inode->i_sb->s_blocksize;
2029    unsigned long this_bio_flag = 0;
2030
2031    set_page_extent_mapped(page);
2032
2033    end = page_end;
2034    lock_extent(tree, start, end, GFP_NOFS);
2035
2036    if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037        char *userpage;
2038        size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2039
2040        if (zero_offset) {
2041            iosize = PAGE_CACHE_SIZE - zero_offset;
2042            userpage = kmap_atomic(page, KM_USER0);
2043            memset(userpage + zero_offset, 0, iosize);
2044            flush_dcache_page(page);
2045            kunmap_atomic(userpage, KM_USER0);
2046        }
2047    }
2048    while (cur <= end) {
2049        if (cur >= last_byte) {
2050            char *userpage;
2051            iosize = PAGE_CACHE_SIZE - page_offset;
2052            userpage = kmap_atomic(page, KM_USER0);
2053            memset(userpage + page_offset, 0, iosize);
2054            flush_dcache_page(page);
2055            kunmap_atomic(userpage, KM_USER0);
2056            set_extent_uptodate(tree, cur, cur + iosize - 1,
2057                        GFP_NOFS);
2058            unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2059            break;
2060        }
2061        em = get_extent(inode, page, page_offset, cur,
2062                end - cur + 1, 0);
2063        if (IS_ERR(em) || !em) {
2064            SetPageError(page);
2065            unlock_extent(tree, cur, end, GFP_NOFS);
2066            break;
2067        }
2068        extent_offset = cur - em->start;
2069        BUG_ON(extent_map_end(em) <= cur);
2070        BUG_ON(end < cur);
2071
2072        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2073            this_bio_flag = EXTENT_BIO_COMPRESSED;
2074
2075        iosize = min(extent_map_end(em) - cur, end - cur + 1);
2076        cur_end = min(extent_map_end(em) - 1, end);
2077        iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2078        if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2079            disk_io_size = em->block_len;
2080            sector = em->block_start >> 9;
2081        } else {
2082            sector = (em->block_start + extent_offset) >> 9;
2083            disk_io_size = iosize;
2084        }
2085        bdev = em->bdev;
2086        block_start = em->block_start;
2087        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2088            block_start = EXTENT_MAP_HOLE;
2089        free_extent_map(em);
2090        em = NULL;
2091
2092        /* we've found a hole, just zero and go on */
2093        if (block_start == EXTENT_MAP_HOLE) {
2094            char *userpage;
2095            userpage = kmap_atomic(page, KM_USER0);
2096            memset(userpage + page_offset, 0, iosize);
2097            flush_dcache_page(page);
2098            kunmap_atomic(userpage, KM_USER0);
2099
2100            set_extent_uptodate(tree, cur, cur + iosize - 1,
2101                        GFP_NOFS);
2102            unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2103            cur = cur + iosize;
2104            page_offset += iosize;
2105            continue;
2106        }
2107        /* the get_extent function already copied into the page */
2108        if (test_range_bit(tree, cur, cur_end,
2109                   EXTENT_UPTODATE, 1, NULL)) {
2110            check_page_uptodate(tree, page);
2111            unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2112            cur = cur + iosize;
2113            page_offset += iosize;
2114            continue;
2115        }
2116        /* we have an inline extent but it didn't get marked
2117         * uptodate.  Error out
2118         */
2119        if (block_start == EXTENT_MAP_INLINE) {
2120            SetPageError(page);
2121            unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2122            cur = cur + iosize;
2123            page_offset += iosize;
2124            continue;
2125        }
2126
2127        ret = 0;
2128        if (tree->ops && tree->ops->readpage_io_hook) {
2129            ret = tree->ops->readpage_io_hook(page, cur,
2130                              cur + iosize - 1);
2131        }
2132        if (!ret) {
2133            unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2134            pnr -= page->index;
2135            ret = submit_extent_page(READ, tree, page,
2136                     sector, disk_io_size, page_offset,
2137                     bdev, bio, pnr,
2138                     end_bio_extent_readpage, mirror_num,
2139                     *bio_flags,
2140                     this_bio_flag);
2141            nr++;
2142            *bio_flags = this_bio_flag;
2143        }
2144        if (ret)
2145            SetPageError(page);
2146        cur = cur + iosize;
2147        page_offset += iosize;
2148    }
2149    if (!nr) {
2150        if (!PageError(page))
2151            SetPageUptodate(page);
2152        unlock_page(page);
2153    }
2154    return 0;
2155}
2156
2157int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2158                get_extent_t *get_extent)
2159{
2160    struct bio *bio = NULL;
2161    unsigned long bio_flags = 0;
2162    int ret;
2163
2164    ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2165                      &bio_flags);
2166    if (bio)
2167        submit_one_bio(READ, bio, 0, bio_flags);
2168    return ret;
2169}
2170
2171static noinline void update_nr_written(struct page *page,
2172                      struct writeback_control *wbc,
2173                      unsigned long nr_written)
2174{
2175    wbc->nr_to_write -= nr_written;
2176    if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2177        wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2178        page->mapping->writeback_index = page->index + nr_written;
2179}
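
/*
 * update_nr_written() charges the pages we wrote against the caller's
 * writeback budget and, for range_cyclic writeback or a whole-file
 * writeback that still has budget left, advances
 * mapping->writeback_index so the next cyclic pass resumes after the
 * pages handled here instead of rescanning them.
 */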
2180
2181/*
2182 * the writepage semantics are similar to regular writepage. extent
2183 * records are inserted to lock ranges in the tree, and as dirty areas
2184 * are found, they are marked writeback. Then the lock bits are removed
2185 * and the end_io handler clears the writeback ranges
2186 */
2187static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2188                  void *data)
2189{
2190    struct inode *inode = page->mapping->host;
2191    struct extent_page_data *epd = data;
2192    struct extent_io_tree *tree = epd->tree;
2193    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2194    u64 delalloc_start;
2195    u64 page_end = start + PAGE_CACHE_SIZE - 1;
2196    u64 end;
2197    u64 cur = start;
2198    u64 extent_offset;
2199    u64 last_byte = i_size_read(inode);
2200    u64 block_start;
2201    u64 iosize;
2202    u64 unlock_start;
2203    sector_t sector;
2204    struct extent_state *cached_state = NULL;
2205    struct extent_map *em;
2206    struct block_device *bdev;
2207    int ret;
2208    int nr = 0;
2209    size_t pg_offset = 0;
2210    size_t blocksize;
2211    loff_t i_size = i_size_read(inode);
2212    unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2213    u64 nr_delalloc;
2214    u64 delalloc_end;
2215    int page_started;
2216    int compressed;
2217    int write_flags;
2218    unsigned long nr_written = 0;
2219
2220    if (wbc->sync_mode == WB_SYNC_ALL)
2221        write_flags = WRITE_SYNC_PLUG;
2222    else
2223        write_flags = WRITE;
2224
2225    WARN_ON(!PageLocked(page));
2226    pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2227    if (page->index > end_index ||
2228       (page->index == end_index && !pg_offset)) {
2229        page->mapping->a_ops->invalidatepage(page, 0);
2230        unlock_page(page);
2231        return 0;
2232    }
2233
2234    if (page->index == end_index) {
2235        char *userpage;
2236
2237        userpage = kmap_atomic(page, KM_USER0);
2238        memset(userpage + pg_offset, 0,
2239               PAGE_CACHE_SIZE - pg_offset);
2240        kunmap_atomic(userpage, KM_USER0);
2241        flush_dcache_page(page);
2242    }
2243    pg_offset = 0;
2244
2245    set_page_extent_mapped(page);
2246
2247    delalloc_start = start;
2248    delalloc_end = 0;
2249    page_started = 0;
2250    if (!epd->extent_locked) {
2251        u64 delalloc_to_write = 0;
2252        /*
2253         * make sure the wbc mapping index is at least updated
2254         * to this page.
2255         */
2256        update_nr_written(page, wbc, 0);
2257
2258        while (delalloc_end < page_end) {
2259            nr_delalloc = find_lock_delalloc_range(inode, tree,
2260                               page,
2261                               &delalloc_start,
2262                               &delalloc_end,
2263                               128 * 1024 * 1024);
2264            if (nr_delalloc == 0) {
2265                delalloc_start = delalloc_end + 1;
2266                continue;
2267            }
2268            tree->ops->fill_delalloc(inode, page, delalloc_start,
2269                         delalloc_end, &page_started,
2270                         &nr_written);
2271            /*
2272             * delalloc_end is already one less than the total
2273             * length, so we don't subtract one from
2274             * PAGE_CACHE_SIZE
2275             */
2276            delalloc_to_write += (delalloc_end - delalloc_start +
2277                          PAGE_CACHE_SIZE) >>
2278                          PAGE_CACHE_SHIFT;
2279            delalloc_start = delalloc_end + 1;
2280        }
2281        if (wbc->nr_to_write < delalloc_to_write) {
2282            int thresh = 8192;
2283
2284            if (delalloc_to_write < thresh * 2)
2285                thresh = delalloc_to_write;
2286            wbc->nr_to_write = min_t(u64, delalloc_to_write,
2287                         thresh);
2288        }
2289
2290        /* did the fill delalloc function already unlock and start
2291         * the IO?
2292         */
2293        if (page_started) {
2294            ret = 0;
2295            /*
2296             * we've unlocked the page, so we can't update
2297             * the mapping's writeback index, just update
2298             * nr_to_write.
2299             */
2300            wbc->nr_to_write -= nr_written;
2301            goto done_unlocked;
2302        }
2303    }
2304    if (tree->ops && tree->ops->writepage_start_hook) {
2305        ret = tree->ops->writepage_start_hook(page, start,
2306                              page_end);
2307        if (ret == -EAGAIN) {
2308            redirty_page_for_writepage(wbc, page);
2309            update_nr_written(page, wbc, nr_written);
2310            unlock_page(page);
2311            ret = 0;
2312            goto done_unlocked;
2313        }
2314    }
2315
2316    /*
2317     * we don't want to touch the inode after unlocking the page,
2318     * so we update the mapping writeback index now
2319     */
2320    update_nr_written(page, wbc, nr_written + 1);
2321
2322    end = page_end;
2323    if (last_byte <= start) {
2324        if (tree->ops && tree->ops->writepage_end_io_hook)
2325            tree->ops->writepage_end_io_hook(page, start,
2326                             page_end, NULL, 1);
2327        unlock_start = page_end + 1;
2328        goto done;
2329    }
2330
2331    blocksize = inode->i_sb->s_blocksize;
2332
2333    while (cur <= end) {
2334        if (cur >= last_byte) {
2335            if (tree->ops && tree->ops->writepage_end_io_hook)
2336                tree->ops->writepage_end_io_hook(page, cur,
2337                             page_end, NULL, 1);
2338            unlock_start = page_end + 1;
2339            break;
2340        }
2341        em = epd->get_extent(inode, page, pg_offset, cur,
2342                     end - cur + 1, 1);
2343        if (IS_ERR(em) || !em) {
2344            SetPageError(page);
2345            break;
2346        }
2347
2348        extent_offset = cur - em->start;
2349        BUG_ON(extent_map_end(em) <= cur);
2350        BUG_ON(end < cur);
2351        iosize = min(extent_map_end(em) - cur, end - cur + 1);
2352        iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2353        sector = (em->block_start + extent_offset) >> 9;
2354        bdev = em->bdev;
2355        block_start = em->block_start;
2356        compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2357        free_extent_map(em);
2358        em = NULL;
2359
2360        /*
2361         * compressed and inline extents are written through other
2362         * paths in the FS
2363         */
2364        if (compressed || block_start == EXTENT_MAP_HOLE ||
2365            block_start == EXTENT_MAP_INLINE) {
2366            /*
2367             * end_io notification does not happen here for
2368             * compressed extents
2369             */
2370            if (!compressed && tree->ops &&
2371                tree->ops->writepage_end_io_hook)
2372                tree->ops->writepage_end_io_hook(page, cur,
2373                             cur + iosize - 1,
2374                             NULL, 1);
2375            else if (compressed) {
2376                /* we don't want to end_page_writeback on
2377                 * a compressed extent. this happens
2378                 * elsewhere
2379                 */
2380                nr++;
2381            }
2382
2383            cur += iosize;
2384            pg_offset += iosize;
2385            unlock_start = cur;
2386            continue;
2387        }
2388        /* leave this out until we have a page_mkwrite call */
2389        if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2390                   EXTENT_DIRTY, 0, NULL)) {
2391            cur = cur + iosize;
2392            pg_offset += iosize;
2393            continue;
2394        }
2395
2396        if (tree->ops && tree->ops->writepage_io_hook) {
2397            ret = tree->ops->writepage_io_hook(page, cur,
2398                        cur + iosize - 1);
2399        } else {
2400            ret = 0;
2401        }
2402        if (ret) {
2403            SetPageError(page);
2404        } else {
2405            unsigned long max_nr = end_index + 1;
2406
2407            set_range_writeback(tree, cur, cur + iosize - 1);
2408            if (!PageWriteback(page)) {
2409                printk(KERN_ERR "btrfs warning page %lu not "
2410                       "writeback, cur %llu end %llu\n",
2411                       page->index, (unsigned long long)cur,
2412                       (unsigned long long)end);
2413            }
2414
2415            ret = submit_extent_page(write_flags, tree, page,
2416                         sector, iosize, pg_offset,
2417                         bdev, &epd->bio, max_nr,
2418                         end_bio_extent_writepage,
2419                         0, 0, 0);
2420            if (ret)
2421                SetPageError(page);
2422        }
2423        cur = cur + iosize;
2424        pg_offset += iosize;
2425        nr++;
2426    }
2427done:
2428    if (nr == 0) {
2429        /* make sure the mapping tag for page dirty gets cleared */
2430        set_page_writeback(page);
2431        end_page_writeback(page);
2432    }
2433    unlock_page(page);
2434
2435done_unlocked:
2436
2437    /* drop our reference on any cached states */
2438    free_extent_state(cached_state);
2439    return 0;
2440}
2441
2442/**
2443 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2444 * @mapping: address space structure to write
2445 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2446 * @writepage: function called for each page
2447 * @data: data passed to writepage function
2448 *
2449 * If a page is already under I/O, write_cache_pages() skips it, even
2450 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2451 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2452 * and msync() need to guarantee that all the data which was dirty at the time
2453 * the call was made get new I/O started against them. If wbc->sync_mode is
2454 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2455 * existing IO to complete.
2456 */
2457static int extent_write_cache_pages(struct extent_io_tree *tree,
2458                 struct address_space *mapping,
2459                 struct writeback_control *wbc,
2460                 writepage_t writepage, void *data,
2461                 void (*flush_fn)(void *))
2462{
2463    int ret = 0;
2464    int done = 0;
2465    int nr_to_write_done = 0;
2466    struct pagevec pvec;
2467    int nr_pages;
2468    pgoff_t index;
2469    pgoff_t end; /* Inclusive */
2470    int scanned = 0;
2471    int range_whole = 0;
2472
2473    pagevec_init(&pvec, 0);
2474    if (wbc->range_cyclic) {
2475        index = mapping->writeback_index; /* Start from prev offset */
2476        end = -1;
2477    } else {
2478        index = wbc->range_start >> PAGE_CACHE_SHIFT;
2479        end = wbc->range_end >> PAGE_CACHE_SHIFT;
2480        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2481            range_whole = 1;
2482        scanned = 1;
2483    }
2484retry:
2485    while (!done && !nr_to_write_done && (index <= end) &&
2486           (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2487                  PAGECACHE_TAG_DIRTY, min(end - index,
2488                  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2489        unsigned i;
2490
2491        scanned = 1;
2492        for (i = 0; i < nr_pages; i++) {
2493            struct page *page = pvec.pages[i];
2494
2495            /*
2496             * At this point we hold neither mapping->tree_lock nor
2497             * lock on the page itself: the page may be truncated or
2498             * invalidated (changing page->mapping to NULL), or even
2499             * swizzled back from swapper_space to tmpfs file
2500             * mapping
2501             */
2502            if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2503                tree->ops->write_cache_pages_lock_hook(page);
2504            else
2505                lock_page(page);
2506
2507            if (unlikely(page->mapping != mapping)) {
2508                unlock_page(page);
2509                continue;
2510            }
2511
2512            if (!wbc->range_cyclic && page->index > end) {
2513                done = 1;
2514                unlock_page(page);
2515                continue;
2516            }
2517
2518            if (wbc->sync_mode != WB_SYNC_NONE) {
2519                if (PageWriteback(page))
2520                    flush_fn(data);
2521                wait_on_page_writeback(page);
2522            }
2523
2524            if (PageWriteback(page) ||
2525                !clear_page_dirty_for_io(page)) {
2526                unlock_page(page);
2527                continue;
2528            }
2529
2530            ret = (*writepage)(page, wbc, data);
2531
2532            if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2533                unlock_page(page);
2534                ret = 0;
2535            }
2536            if (ret)
2537                done = 1;
2538
2539            /*
2540             * the filesystem may choose to bump up nr_to_write.
2541             * We have to make sure to honor the new nr_to_write
2542             * at any time
2543             */
2544            nr_to_write_done = wbc->nr_to_write <= 0;
2545        }
2546        pagevec_release(&pvec);
2547        cond_resched();
2548    }
2549    if (!scanned && !done) {
2550        /*
2551         * We hit the last page and there is more work to be done: wrap
2552         * back to the start of the file
2553         */
2554        scanned = 1;
2555        index = 0;
2556        goto retry;
2557    }
2558    return ret;
2559}
2560
2561static void flush_epd_write_bio(struct extent_page_data *epd)
2562{
2563    if (epd->bio) {
2564        if (epd->sync_io)
2565            submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2566        else
2567            submit_one_bio(WRITE, epd->bio, 0, 0);
2568        epd->bio = NULL;
2569    }
2570}
2571
2572static noinline void flush_write_bio(void *data)
2573{
2574    struct extent_page_data *epd = data;
2575    flush_epd_write_bio(epd);
2576}
2577
2578int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2579              get_extent_t *get_extent,
2580              struct writeback_control *wbc)
2581{
2582    int ret;
2583    struct address_space *mapping = page->mapping;
2584    struct extent_page_data epd = {
2585        .bio = NULL,
2586        .tree = tree,
2587        .get_extent = get_extent,
2588        .extent_locked = 0,
2589        .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2590    };
2591    struct writeback_control wbc_writepages = {
2592        .bdi = wbc->bdi,
2593        .sync_mode = wbc->sync_mode,
2594        .older_than_this = NULL,
2595        .nr_to_write = 64,
2596        .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2597        .range_end = (loff_t)-1,
2598    };
2599
2600    ret = __extent_writepage(page, wbc, &epd);
2601
2602    extent_write_cache_pages(tree, mapping, &wbc_writepages,
2603                 __extent_writepage, &epd, flush_write_bio);
2604    flush_epd_write_bio(&epd);
2605    return ret;
2606}
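
/*
 * extent_write_full_page() writes the requested page first and then, via
 * the private wbc_writepages above, opportunistically pushes out up to 64
 * of the dirty pages that follow it (range_start is set one page past the
 * target), so a single ->writepage call can seed a larger contiguous
 * write.  The return value is that of the original page only.
 */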
2607
2608int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2609                  u64 start, u64 end, get_extent_t *get_extent,
2610                  int mode)
2611{
2612    int ret = 0;
2613    struct address_space *mapping = inode->i_mapping;
2614    struct page *page;
2615    unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2616        PAGE_CACHE_SHIFT;
2617
2618    struct extent_page_data epd = {
2619        .bio = NULL,
2620        .tree = tree,
2621        .get_extent = get_extent,
2622        .extent_locked = 1,
2623        .sync_io = mode == WB_SYNC_ALL,
2624    };
2625    struct writeback_control wbc_writepages = {
2626        .bdi = inode->i_mapping->backing_dev_info,
2627        .sync_mode = mode,
2628        .older_than_this = NULL,
2629        .nr_to_write = nr_pages * 2,
2630        .range_start = start,
2631        .range_end = end + 1,
2632    };
2633
2634    while (start <= end) {
2635        page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2636        if (clear_page_dirty_for_io(page))
2637            ret = __extent_writepage(page, &wbc_writepages, &epd);
2638        else {
2639            if (tree->ops && tree->ops->writepage_end_io_hook)
2640                tree->ops->writepage_end_io_hook(page, start,
2641                         start + PAGE_CACHE_SIZE - 1,
2642                         NULL, 1);
2643            unlock_page(page);
2644        }
2645        page_cache_release(page);
2646        start += PAGE_CACHE_SIZE;
2647    }
2648
2649    flush_epd_write_bio(&epd);
2650    return ret;
2651}
2652
2653int extent_writepages(struct extent_io_tree *tree,
2654              struct address_space *mapping,
2655              get_extent_t *get_extent,
2656              struct writeback_control *wbc)
2657{
2658    int ret = 0;
2659    struct extent_page_data epd = {
2660        .bio = NULL,
2661        .tree = tree,
2662        .get_extent = get_extent,
2663        .extent_locked = 0,
2664        .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2665    };
2666
2667    ret = extent_write_cache_pages(tree, mapping, wbc,
2668                       __extent_writepage, &epd,
2669                       flush_write_bio);
2670    flush_epd_write_bio(&epd);
2671    return ret;
2672}
2673
2674int extent_readpages(struct extent_io_tree *tree,
2675             struct address_space *mapping,
2676             struct list_head *pages, unsigned nr_pages,
2677             get_extent_t get_extent)
2678{
2679    struct bio *bio = NULL;
2680    unsigned page_idx;
2681    unsigned long bio_flags = 0;
2682
2683    for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2684        struct page *page = list_entry(pages->prev, struct page, lru);
2685
2686        prefetchw(&page->flags);
2687        list_del(&page->lru);
2688        if (!add_to_page_cache_lru(page, mapping,
2689                    page->index, GFP_KERNEL)) {
2690            __extent_read_full_page(tree, page, get_extent,
2691                        &bio, 0, &bio_flags);
2692        }
2693        page_cache_release(page);
2694    }
2695    BUG_ON(!list_empty(pages));
2696    if (bio)
2697        submit_one_bio(READ, bio, 0, bio_flags);
2698    return 0;
2699}
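
/*
 * extent_readpages() is the multi-page read (readahead) entry point:
 * pages arrive on a list and are consumed from the tail via pages->prev;
 * each is added to the page cache (pages already present are simply
 * released) and read through __extent_read_full_page() with a shared bio
 * and bio_flags, so physically contiguous pages coalesce into a single
 * bio that is submitted once at the end.
 */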
2700
2701/*
2702 * basic invalidatepage code, this waits on any locked or writeback
2703 * ranges corresponding to the page, and then deletes any extent state
2704 * records from the tree
2705 */
2706int extent_invalidatepage(struct extent_io_tree *tree,
2707              struct page *page, unsigned long offset)
2708{
2709    struct extent_state *cached_state = NULL;
2710    u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2711    u64 end = start + PAGE_CACHE_SIZE - 1;
2712    size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2713
2714    start += (offset + blocksize - 1) & ~(blocksize - 1);
2715    if (start > end)
2716        return 0;
2717
2718    lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2719    wait_on_page_writeback(page);
2720    clear_extent_bit(tree, start, end,
2721             EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2722             EXTENT_DO_ACCOUNTING,
2723             1, 1, &cached_state, GFP_NOFS);
2724    return 0;
2725}
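
/*
 * The rounding in extent_invalidatepage() keeps state for any block that
 * is only partially invalidated: assuming 4K blocks for the example,
 * offset 0 clears extent state for the whole page, while offset 1 rounds
 * start up by 4096, so the first block keeps its bits and only the blocks
 * after it are cleared.
 */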
2726
2727/*
2728 * simple commit_write call, set_page_dirty is used to mark both
2729 * the pages and the extent records as dirty
2730 */
2731int extent_commit_write(struct extent_io_tree *tree,
2732            struct inode *inode, struct page *page,
2733            unsigned from, unsigned to)
2734{
2735    loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2736
2737    set_page_extent_mapped(page);
2738    set_page_dirty(page);
2739
2740    if (pos > inode->i_size) {
2741        i_size_write(inode, pos);
2742        mark_inode_dirty(inode);
2743    }
2744    return 0;
2745}
2746
2747int extent_prepare_write(struct extent_io_tree *tree,
2748             struct inode *inode, struct page *page,
2749             unsigned from, unsigned to, get_extent_t *get_extent)
2750{
2751    u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2752    u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2753    u64 block_start;
2754    u64 orig_block_start;
2755    u64 block_end;
2756    u64 cur_end;
2757    struct extent_map *em;
2758    unsigned blocksize = 1 << inode->i_blkbits;
2759    size_t page_offset = 0;
2760    size_t block_off_start;
2761    size_t block_off_end;
2762    int err = 0;
2763    int iocount = 0;
2764    int ret = 0;
2765    int isnew;
2766
2767    set_page_extent_mapped(page);
2768
2769    block_start = (page_start + from) & ~((u64)blocksize - 1);
2770    block_end = (page_start + to - 1) | (blocksize - 1);
2771    orig_block_start = block_start;
2772
2773    lock_extent(tree, page_start, page_end, GFP_NOFS);
2774    while (block_start <= block_end) {
2775        em = get_extent(inode, page, page_offset, block_start,
2776                block_end - block_start + 1, 1);
2777        if (IS_ERR(em) || !em)
2778            goto err;
2779
2780        cur_end = min(block_end, extent_map_end(em) - 1);
2781        block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2782        block_off_end = block_off_start + blocksize;
2783        isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2784
2785        if (!PageUptodate(page) && isnew &&
2786            (block_off_end > to || block_off_start < from)) {
2787            void *kaddr;
2788
2789            kaddr = kmap_atomic(page, KM_USER0);
2790            if (block_off_end > to)
2791                memset(kaddr + to, 0, block_off_end - to);
2792            if (block_off_start < from)
2793                memset(kaddr + block_off_start, 0,
2794                       from - block_off_start);
2795            flush_dcache_page(page);
2796            kunmap_atomic(kaddr, KM_USER0);
2797        }
2798        if ((em->block_start != EXTENT_MAP_HOLE &&
2799             em->block_start != EXTENT_MAP_INLINE) &&
2800            !isnew && !PageUptodate(page) &&
2801            (block_off_end > to || block_off_start < from) &&
2802            !test_range_bit(tree, block_start, cur_end,
2803                    EXTENT_UPTODATE, 1, NULL)) {
2804            u64 sector;
2805            u64 extent_offset = block_start - em->start;
2806            size_t iosize;
2807            sector = (em->block_start + extent_offset) >> 9;
2808            iosize = (cur_end - block_start + blocksize) &
2809                ~((u64)blocksize - 1);
2810            /*
2811             * we've already got the extent locked, but we
2812             * need to split the state such that our end_bio
2813             * handler can clear the lock.
2814             */
2815            set_extent_bit(tree, block_start,
2816                       block_start + iosize - 1,
2817                       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2818            ret = submit_extent_page(READ, tree, page,
2819                     sector, iosize, page_offset, em->bdev,
2820                     NULL, 1,
2821                     end_bio_extent_preparewrite, 0,
2822                     0, 0);
2823            iocount++;
2824            block_start = block_start + iosize;
2825        } else {
2826            set_extent_uptodate(tree, block_start, cur_end,
2827                        GFP_NOFS);
2828            unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2829            block_start = cur_end + 1;
2830        }
2831        page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2832        free_extent_map(em);
2833    }
2834    if (iocount) {
2835        wait_extent_bit(tree, orig_block_start,
2836                block_end, EXTENT_LOCKED);
2837    }
2838    check_page_uptodate(tree, page);
2839err:
2840    /* FIXME, zero out newly allocated blocks on error */
2841    return err;
2842}
2843
2844/*
2845 * a helper for releasepage, this tests for areas of the page that
2846 * are locked or under IO and drops the related state bits if it is safe
2847 * to drop the page.
2848 */
2849int try_release_extent_state(struct extent_map_tree *map,
2850                 struct extent_io_tree *tree, struct page *page,
2851                 gfp_t mask)
2852{
2853    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2854    u64 end = start + PAGE_CACHE_SIZE - 1;
2855    int ret = 1;
2856
2857    if (test_range_bit(tree, start, end,
2858               EXTENT_IOBITS, 0, NULL))
2859        ret = 0;
2860    else {
2861        if ((mask & GFP_NOFS) == GFP_NOFS)
2862            mask = GFP_NOFS;
2863        /*
2864         * at this point we can safely clear everything except the
2865         * locked bit and the nodatasum bit
2866         */
2867        clear_extent_bit(tree, start, end,
2868                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2869                 0, 0, NULL, mask);
2870    }
2871    return ret;
2872}
2873
2874/*
2875 * a helper for releasepage. As long as there are no locked extents
2876 * in the range corresponding to the page, both state records and extent
2877 * map records are removed
2878 */
2879int try_release_extent_mapping(struct extent_map_tree *map,
2880                   struct extent_io_tree *tree, struct page *page,
2881                   gfp_t mask)
2882{
2883    struct extent_map *em;
2884    u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2885    u64 end = start + PAGE_CACHE_SIZE - 1;
2886
2887    if ((mask & __GFP_WAIT) &&
2888        page->mapping->host->i_size > 16 * 1024 * 1024) {
2889        u64 len;
2890        while (start <= end) {
2891            len = end - start + 1;
2892            write_lock(&map->lock);
2893            em = lookup_extent_mapping(map, start, len);
2894            if (!em || IS_ERR(em)) {
2895                write_unlock(&map->lock);
2896                break;
2897            }
2898            if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2899                em->start != start) {
2900                write_unlock(&map->lock);
2901                free_extent_map(em);
2902                break;
2903            }
2904            if (!test_range_bit(tree, em->start,
2905                        extent_map_end(em) - 1,
2906                        EXTENT_LOCKED | EXTENT_WRITEBACK,
2907                        0, NULL)) {
2908                remove_extent_mapping(map, em);
2909                /* once for the rb tree */
2910                free_extent_map(em);
2911            }
2912            start = extent_map_end(em);
2913            write_unlock(&map->lock);
2914
2915            /* once for us */
2916            free_extent_map(em);
2917        }
2918    }
2919    return try_release_extent_state(map, tree, page, mask);
2920}
2921
2922sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2923        get_extent_t *get_extent)
2924{
2925    struct inode *inode = mapping->host;
2926    struct extent_state *cached_state = NULL;
2927    u64 start = iblock << inode->i_blkbits;
2928    sector_t sector = 0;
2929    size_t blksize = (1 << inode->i_blkbits);
2930    struct extent_map *em;
2931
2932    lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2933             0, &cached_state, GFP_NOFS);
2934    em = get_extent(inode, NULL, 0, start, blksize, 0);
2935    unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2936                 start + blksize - 1, &cached_state, GFP_NOFS);
2937    if (!em || IS_ERR(em))
2938        return 0;
2939
2940    if (em->block_start > EXTENT_MAP_LAST_BYTE)
2941        goto out;
2942
2943    sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2944out:
2945    free_extent_map(em);
2946    return sector;
2947}
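
/*
 * extent_bmap() returns block numbers in filesystem-block units, as
 * bmap()/FIBMAP expects.  As a worked example (assuming 4K blocks, so
 * i_blkbits == 12): if iblock 10 lands in an extent that maps file offset
 * 0 to disk byte 0x100000, then start == 40960 and
 * sector == (0x100000 + 40960 - 0) >> 12 == 266.  Extents with
 * block_start above EXTENT_MAP_LAST_BYTE (holes and the like) report 0.
 */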
2948
2949int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2950        __u64 start, __u64 len, get_extent_t *get_extent)
2951{
2952    int ret;
2953    u64 off = start;
2954    u64 max = start + len;
2955    u32 flags = 0;
2956    u64 disko = 0;
2957    struct extent_map *em = NULL;
2958    struct extent_state *cached_state = NULL;
2959    int end = 0;
2960    u64 em_start = 0, em_len = 0;
2961    unsigned long emflags;
2962    ret = 0;
2963
2964    if (len == 0)
2965        return -EINVAL;
2966
2967    lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2968             &cached_state, GFP_NOFS);
2969    em = get_extent(inode, NULL, 0, off, max - off, 0);
2970    if (!em)
2971        goto out;
2972    if (IS_ERR(em)) {
2973        ret = PTR_ERR(em);
2974        goto out;
2975    }
2976    while (!end) {
2977        off = em->start + em->len;
2978        if (off >= max)
2979            end = 1;
2980
2981        em_start = em->start;
2982        em_len = em->len;
2983
2984        disko = 0;
2985        flags = 0;
2986
2987        if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2988            end = 1;
2989            flags |= FIEMAP_EXTENT_LAST;
2990        } else if (em->block_start == EXTENT_MAP_HOLE) {
2991            flags |= FIEMAP_EXTENT_UNWRITTEN;
2992        } else if (em->block_start == EXTENT_MAP_INLINE) {
2993            flags |= (FIEMAP_EXTENT_DATA_INLINE |
2994                  FIEMAP_EXTENT_NOT_ALIGNED);
2995        } else if (em->block_start == EXTENT_MAP_DELALLOC) {
2996            flags |= (FIEMAP_EXTENT_DELALLOC |
2997                  FIEMAP_EXTENT_UNKNOWN);
2998        } else {
2999            disko = em->block_start;
3000        }
3001        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3002            flags |= FIEMAP_EXTENT_ENCODED;
3003
3004        emflags = em->flags;
3005        free_extent_map(em);
3006        em = NULL;
3007
3008        if (!end) {
3009            em = get_extent(inode, NULL, 0, off, max - off, 0);
3010            if (!em)
3011                goto out;
3012            if (IS_ERR(em)) {
3013                ret = PTR_ERR(em);
3014                goto out;
3015            }
3016            emflags = em->flags;
3017        }
3018        if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3019            flags |= FIEMAP_EXTENT_LAST;
3020            end = 1;
3021        }
3022
3023        ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3024                    em_len, flags);
3025        if (ret)
3026            goto out_free;
3027    }
3028out_free:
3029    free_extent_map(em);
3030out:
3031    unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3032                 &cached_state, GFP_NOFS);
3033    return ret;
3034}
3035
3036static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3037                          unsigned long i)
3038{
3039    struct page *p;
3040    struct address_space *mapping;
3041
3042    if (i == 0)
3043        return eb->first_page;
3044    i += eb->start >> PAGE_CACHE_SHIFT;
3045    mapping = eb->first_page->mapping;
3046    if (!mapping)
3047        return NULL;
3048
3049    /*
3050     * extent_buffer_page is only called after pinning the page
3051     * by increasing the reference count. So we know the page must
3052     * be in the radix tree.
3053     */
3054    rcu_read_lock();
3055    p = radix_tree_lookup(&mapping->page_tree, i);
3056    rcu_read_unlock();
3057
3058    return p;
3059}
3060
3061static inline unsigned long num_extent_pages(u64 start, u64 len)
3062{
3063    return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3064        (start >> PAGE_CACHE_SHIFT);
3065}
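
/*
 * num_extent_pages() counts every page the buffer touches, not just
 * len / PAGE_CACHE_SIZE.  Assuming 4K pages for the example: start 16384,
 * len 16384 covers exactly 4 pages, but start 2048, len 16384 covers 5,
 * because the range straddles a partial first and a partial last page.
 */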
3066
3067static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3068                           u64 start,
3069                           unsigned long len,
3070                           gfp_t mask)
3071{
3072    struct extent_buffer *eb = NULL;
3073#if LEAK_DEBUG
3074    unsigned long flags;
3075#endif
3076
3077    eb = kmem_cache_zalloc(extent_buffer_cache, mask);
    if (!eb)
        return NULL;
3078    eb->start = start;
3079    eb->len = len;
3080    spin_lock_init(&eb->lock);
3081    init_waitqueue_head(&eb->lock_wq);
3082
3083#if LEAK_DEBUG
3084    spin_lock_irqsave(&leak_lock, flags);
3085    list_add(&eb->leak_list, &buffers);
3086    spin_unlock_irqrestore(&leak_lock, flags);
3087#endif
3088    atomic_set(&eb->refs, 1);
3089
3090    return eb;
3091}
3092
3093static void __free_extent_buffer(struct extent_buffer *eb)
3094{
3095#if LEAK_DEBUG
3096    unsigned long flags;
3097    spin_lock_irqsave(&leak_lock, flags);
3098    list_del(&eb->leak_list);
3099    spin_unlock_irqrestore(&leak_lock, flags);
3100#endif
3101    kmem_cache_free(extent_buffer_cache, eb);
3102}
3103
3104struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3105                      u64 start, unsigned long len,
3106                      struct page *page0,
3107                      gfp_t mask)
3108{
3109    unsigned long num_pages = num_extent_pages(start, len);
3110    unsigned long i;
3111    unsigned long index = start >> PAGE_CACHE_SHIFT;
3112    struct extent_buffer *eb;
3113    struct extent_buffer *exists = NULL;
3114    struct page *p;
3115    struct address_space *mapping = tree->mapping;
3116    int uptodate = 1;
3117
3118    spin_lock(&tree->buffer_lock);
3119    eb = buffer_search(tree, start);
3120    if (eb) {
3121        atomic_inc(&eb->refs);
3122        spin_unlock(&tree->buffer_lock);
3123        mark_page_accessed(eb->first_page);
3124        return eb;
3125    }
3126    spin_unlock(&tree->buffer_lock);
3127
3128    eb = __alloc_extent_buffer(tree, start, len, mask);
3129    if (!eb)
3130        return NULL;
3131
3132    if (page0) {
3133        eb->first_page = page0;
3134        i = 1;
3135        index++;
3136        page_cache_get(page0);
3137        mark_page_accessed(page0);
3138        set_page_extent_mapped(page0);
3139        set_page_extent_head(page0, len);
3140        uptodate = PageUptodate(page0);
3141    } else {
3142        i = 0;
3143    }
3144    for (; i < num_pages; i++, index++) {
3145        p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3146        if (!p) {
3147            WARN_ON(1);
3148            goto free_eb;
3149        }
3150        set_page_extent_mapped(p);
3151        mark_page_accessed(p);
3152        if (i == 0) {
3153            eb->first_page = p;
3154            set_page_extent_head(p, len);
3155        } else {
3156            set_page_private(p, EXTENT_PAGE_PRIVATE);
3157        }
3158        if (!PageUptodate(p))
3159            uptodate = 0;
3160        unlock_page(p);
3161    }
3162    if (uptodate)
3163        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3164
3165    spin_lock(&tree->buffer_lock);
3166    exists = buffer_tree_insert(tree, start, &eb->rb_node);
3167    if (exists) {
3168        /* add one reference for the caller */
3169        atomic_inc(&exists->refs);
3170        spin_unlock(&tree->buffer_lock);
3171        goto free_eb;
3172    }
3173    /* add one reference for the tree */
3174    atomic_inc(&eb->refs);
3175    spin_unlock(&tree->buffer_lock);
3176    return eb;
3177
3178free_eb:
3179    if (!atomic_dec_and_test(&eb->refs))
3180        return exists;
3181    for (index = 1; index < i; index++)
3182        page_cache_release(extent_buffer_page(eb, index));
3183    page_cache_release(extent_buffer_page(eb, 0));
3184    __free_extent_buffer(eb);
3185    return exists;
3186}
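
/*
 * The insert under tree->buffer_lock at the end of alloc_extent_buffer()
 * is what resolves races: if another task built a buffer for the same
 * start while we were allocating pages, buffer_tree_insert() hands back
 * the existing buffer, we take a reference on it for the caller and tear
 * our own copy down through the free_eb path.
 */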
3187
3188struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3189                     u64 start, unsigned long len,
3190                      gfp_t mask)
3191{
3192    struct extent_buffer *eb;
3193
3194    spin_lock(&tree->buffer_lock);
3195    eb = buffer_search(tree, start);
3196    if (eb)
3197        atomic_inc(&eb->refs);
3198    spin_unlock(&tree->buffer_lock);
3199
3200    if (eb)
3201        mark_page_accessed(eb->first_page);
3202
3203    return eb;
3204}
3205
3206void free_extent_buffer(struct extent_buffer *eb)
3207{
3208    if (!eb)
3209        return;
3210
3211    if (!atomic_dec_and_test(&eb->refs))
3212        return;
3213
3214    WARN_ON(1);
3215}
3216
3217int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3218                  struct extent_buffer *eb)
3219{
3220    unsigned long i;
3221    unsigned long num_pages;
3222    struct page *page;
3223
3224    num_pages = num_extent_pages(eb->start, eb->len);
3225
3226    for (i = 0; i < num_pages; i++) {
3227        page = extent_buffer_page(eb, i);
3228        if (!PageDirty(page))
3229            continue;
3230
3231        lock_page(page);
3232        if (i == 0)
3233            set_page_extent_head(page, eb->len);
3234        else
3235            set_page_private(page, EXTENT_PAGE_PRIVATE);
3236
3237        clear_page_dirty_for_io(page);
3238        spin_lock_irq(&page->mapping->tree_lock);
3239        if (!PageDirty(page)) {
3240            radix_tree_tag_clear(&page->mapping->page_tree,
3241                        page_index(page),
3242                        PAGECACHE_TAG_DIRTY);
3243        }
3244        spin_unlock_irq(&page->mapping->tree_lock);
3245        unlock_page(page);
3246    }
3247    return 0;
3248}
3249
3250int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3251                    struct extent_buffer *eb)
3252{
3253    return wait_on_extent_writeback(tree, eb->start,
3254                    eb->start + eb->len - 1);
3255}
3256
3257int set_extent_buffer_dirty(struct extent_io_tree *tree,
3258                 struct extent_buffer *eb)
3259{
3260    unsigned long i;
3261    unsigned long num_pages;
3262    int was_dirty = 0;
3263
3264    was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3265    num_pages = num_extent_pages(eb->start, eb->len);
3266    for (i = 0; i < num_pages; i++)
3267        __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3268    return was_dirty;
3269}
3270
3271int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3272                struct extent_buffer *eb,
3273                struct extent_state **cached_state)
3274{
3275    unsigned long i;
3276    struct page *page;
3277    unsigned long num_pages;
3278
3279    num_pages = num_extent_pages(eb->start, eb->len);
3280    clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3281
3282    clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3283                  cached_state, GFP_NOFS);
3284    for (i = 0; i < num_pages; i++) {
3285        page = extent_buffer_page(eb, i);
3286        if (page)
3287            ClearPageUptodate(page);
3288    }
3289    return 0;
3290}
3291
3292int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3293                struct extent_buffer *eb)
3294{
3295    unsigned long i;
3296    struct page *page;
3297    unsigned long num_pages;
3298
3299    num_pages = num_extent_pages(eb->start, eb->len);
3300
3301    set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3302                GFP_NOFS);
3303    for (i = 0; i < num_pages; i++) {
3304        page = extent_buffer_page(eb, i);
3305        if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3306            ((i == num_pages - 1) &&
3307             ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3308            check_page_uptodate(tree, page);
3309            continue;
3310        }
3311        SetPageUptodate(page);
3312    }
3313    return 0;
3314}
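
/*
 * In set_extent_buffer_uptodate(), pages the buffer covers completely are
 * flagged uptodate directly.  The first and last pages are only run
 * through check_page_uptodate() when the buffer does not start or end on
 * a page boundary, since other ranges sharing those pages may not be
 * uptodate yet.
 */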
3315
3316int extent_range_uptodate(struct extent_io_tree *tree,
3317              u64 start, u64 end)
3318{
3319    struct page *page;
3320    int ret;
3321    int pg_uptodate = 1;
3322    int uptodate;
3323    unsigned long index;
3324
3325    ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3326    if (ret)
3327        return 1;
3328    while (start <= end) {
3329        index = start >> PAGE_CACHE_SHIFT;
3330        page = find_get_page(tree->mapping, index);
3331        uptodate = PageUptodate(page);
3332        page_cache_release(page);
3333        if (!uptodate) {
3334            pg_uptodate = 0;
3335            break;
3336        }
3337        start += PAGE_CACHE_SIZE;
3338    }
3339    return pg_uptodate;
3340}
3341
3342int extent_buffer_uptodate(struct extent_io_tree *tree,
3343               struct extent_buffer *eb,
3344               struct extent_state *cached_state)
3345{
3346    int ret = 0;
3347    unsigned long num_pages;
3348    unsigned long i;
3349    struct page *page;
3350    int pg_uptodate = 1;
3351
3352    if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3353        return 1;
3354
3355    ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3356               EXTENT_UPTODATE, 1, cached_state);
3357    if (ret)
3358        return ret;
3359
3360    num_pages = num_extent_pages(eb->start, eb->len);
3361    for (i = 0; i < num_pages; i++) {
3362        page = extent_buffer_page(eb, i);
3363        if (!PageUptodate(page)) {
3364            pg_uptodate = 0;
3365            break;
3366        }
3367    }
3368    return pg_uptodate;
3369}
3370
3371int read_extent_buffer_pages(struct extent_io_tree *tree,
3372                 struct extent_buffer *eb,
3373                 u64 start, int wait,
3374                 get_extent_t *get_extent, int mirror_num)
3375{
3376    unsigned long i;
3377    unsigned long start_i;
3378    struct page *page;
3379    int err;
3380    int ret = 0;
3381    int locked_pages = 0;
3382    int all_uptodate = 1;
3383    int inc_all_pages = 0;
3384    unsigned long num_pages;
3385    struct bio *bio = NULL;
3386    unsigned long bio_flags = 0;
3387
3388    if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3389        return 0;
3390
3391    if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3392               EXTENT_UPTODATE, 1, NULL)) {
3393        return 0;
3394    }
3395
3396    if (start) {
3397        WARN_ON(start < eb->start);
3398        start_i = (start >> PAGE_CACHE_SHIFT) -
3399            (eb->start >> PAGE_CACHE_SHIFT);
3400    } else {
3401        start_i = 0;
3402    }
3403
3404    num_pages = num_extent_pages(eb->start, eb->len);
3405    for (i = start_i; i < num_pages; i++) {
3406        page = extent_buffer_page(eb, i);
3407        if (!wait) {
3408            if (!trylock_page(page))
3409                goto unlock_exit;
3410        } else {
3411            lock_page(page);
3412        }
3413        locked_pages++;
3414        if (!PageUptodate(page))
3415            all_uptodate = 0;
3416    }
3417    if (all_uptodate) {
3418        if (start_i == 0)
3419            set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3420        goto unlock_exit;
3421    }
3422
3423    for (i = start_i; i < num_pages; i++) {
3424        page = extent_buffer_page(eb, i);
3425        if (inc_all_pages)
3426            page_cache_get(page);
3427        if (!PageUptodate(page)) {
3428            if (start_i == 0)
3429                inc_all_pages = 1;
3430            ClearPageError(page);
3431            err = __extent_read_full_page(tree, page,
3432                              get_extent, &bio,
3433                              mirror_num, &bio_flags);
3434            if (err)
3435                ret = err;
3436        } else {
3437            unlock_page(page);
3438        }
3439    }
3440
3441    if (bio)
3442        submit_one_bio(READ, bio, mirror_num, bio_flags);
3443
3444    if (ret || !wait)
3445        return ret;
3446
3447    for (i = start_i; i < num_pages; i++) {
3448        page = extent_buffer_page(eb, i);
3449        wait_on_page_locked(page);
3450        if (!PageUptodate(page))
3451            ret = -EIO;
3452    }
3453
3454    if (!ret)
3455        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3456    return ret;
3457
3458unlock_exit:
3459    i = start_i;
3460    while (locked_pages > 0) {
3461        page = extent_buffer_page(eb, i);
3462        i++;
3463        unlock_page(page);
3464        locked_pages--;
3465    }
3466    return ret;
3467}
3468
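/*
 * copy 'len' bytes starting at offset 'start' inside the extent buffer
 * into the caller supplied memory at 'dstv', one page at a time.
 *
 * A minimal, hypothetical usage sketch:
 *
 *	struct btrfs_disk_key disk_key;
 *	read_extent_buffer(eb, &disk_key, key_offset, sizeof(disk_key));
 */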
3469void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3470            unsigned long start,
3471            unsigned long len)
3472{
3473    size_t cur;
3474    size_t offset;
3475    struct page *page;
3476    char *kaddr;
3477    char *dst = (char *)dstv;
3478    size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3479    unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3480
3481    WARN_ON(start > eb->len);
3482    WARN_ON(start + len > eb->start + eb->len);
3483
3484    offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3485
3486    while (len > 0) {
3487        page = extent_buffer_page(eb, i);
3488
3489        cur = min(len, (PAGE_CACHE_SIZE - offset));
3490        kaddr = kmap_atomic(page, KM_USER1);
3491        memcpy(dst, kaddr + offset, cur);
3492        kunmap_atomic(kaddr, KM_USER1);
3493
3494        dst += cur;
3495        len -= cur;
3496        offset = 0;
3497        i++;
3498    }
3499}
3500
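/*
 * map a chunk of the extent buffer with kmap_atomic() so the caller can
 * access it directly.  Fails with -EINVAL when the requested range of at
 * least 'min_len' bytes would cross a page boundary.  The kmap token,
 * the mapped address and the extent of the mapping are returned through
 * 'token', 'map', 'map_start' and 'map_len'.
 */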
3501int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3502                   unsigned long min_len, char **token, char **map,
3503                   unsigned long *map_start,
3504                   unsigned long *map_len, int km)
3505{
3506    size_t offset = start & (PAGE_CACHE_SIZE - 1);
3507    char *kaddr;
3508    struct page *p;
3509    size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3510    unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3511    unsigned long end_i = (start_offset + start + min_len - 1) >>
3512        PAGE_CACHE_SHIFT;
3513
3514    if (i != end_i)
3515        return -EINVAL;
3516
3517    if (i == 0) {
3518        offset = start_offset;
3519        *map_start = 0;
3520    } else {
3521        offset = 0;
3522        *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3523    }
3524
3525    if (start + min_len > eb->len) {
3526        printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3527               "wanted %lu %lu\n", (unsigned long long)eb->start,
3528               eb->len, start, min_len);
3529        WARN_ON(1);
3530    }
3531
3532    p = extent_buffer_page(eb, i);
3533    kaddr = kmap_atomic(p, km);
3534    *token = kaddr;
3535    *map = kaddr + offset;
3536    *map_len = PAGE_CACHE_SIZE - offset;
3537    return 0;
3538}
3539
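/*
 * like map_private_extent_buffer(), but any mapping already cached in
 * the extent buffer is dropped first; when one was cached, the new
 * mapping replaces it in eb->map_token, eb->kaddr, eb->map_start and
 * eb->map_len.
 */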
3540int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3541              unsigned long min_len,
3542              char **token, char **map,
3543              unsigned long *map_start,
3544              unsigned long *map_len, int km)
3545{
3546    int err;
3547    int save = 0;
3548    if (eb->map_token) {
3549        unmap_extent_buffer(eb, eb->map_token, km);
3550        eb->map_token = NULL;
3551        save = 1;
3552    }
3553    err = map_private_extent_buffer(eb, start, min_len, token, map,
3554                       map_start, map_len, km);
3555    if (!err && save) {
3556        eb->map_token = *token;
3557        eb->kaddr = *map;
3558        eb->map_start = *map_start;
3559        eb->map_len = *map_len;
3560    }
3561    return err;
3562}
3563
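/* release a mapping created by map_extent_buffer() or map_private_extent_buffer() */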
3564void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3565{
3566    kunmap_atomic(token, km);
3567}
3568
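/*
 * compare 'len' bytes at offset 'start' in the extent buffer with the
 * memory at 'ptrv'.  Returns 0 when they match, otherwise the non-zero
 * memcmp() result of the first chunk that differs.
 */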
3569int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3570              unsigned long start,
3571              unsigned long len)
3572{
3573    size_t cur;
3574    size_t offset;
3575    struct page *page;
3576    char *kaddr;
3577    char *ptr = (char *)ptrv;
3578    size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3579    unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3580    int ret = 0;
3581
3582    WARN_ON(start > eb->len);
3583    WARN_ON(start + len > eb->start + eb->len);
3584
3585    offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3586
3587    while (len > 0) {
3588        page = extent_buffer_page(eb, i);
3589
3590        cur = min(len, (PAGE_CACHE_SIZE - offset));
3591
3592        kaddr = kmap_atomic(page, KM_USER0);
3593        ret = memcmp(ptr, kaddr + offset, cur);
3594        kunmap_atomic(kaddr, KM_USER0);
3595        if (ret)
3596            break;
3597
3598        ptr += cur;
3599        len -= cur;
3600        offset = 0;
3601        i++;
3602    }
3603    return ret;
3604}
3605
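/*
 * copy 'len' bytes from 'srcv' into the extent buffer at offset 'start',
 * page by page.  The pages are expected to already be uptodate.
 */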
3606void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3607             unsigned long start, unsigned long len)
3608{
3609    size_t cur;
3610    size_t offset;
3611    struct page *page;
3612    char *kaddr;
3613    char *src = (char *)srcv;
3614    size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3615    unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3616
3617    WARN_ON(start > eb->len);
3618    WARN_ON(start + len > eb->start + eb->len);
3619
3620    offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3621
3622    while (len > 0) {
3623        page = extent_buffer_page(eb, i);
3624        WARN_ON(!PageUptodate(page));
3625
3626        cur = min(len, PAGE_CACHE_SIZE - offset);
3627        kaddr = kmap_atomic(page, KM_USER1);
3628        memcpy(kaddr + offset, src, cur);
3629        kunmap_atomic(kaddr, KM_USER1);
3630
3631        src += cur;
3632        len -= cur;
3633        offset = 0;
3634        i++;
3635    }
3636}
3637
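/* fill 'len' bytes of the extent buffer, starting at 'start', with the byte 'c' */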
3638void memset_extent_buffer(struct extent_buffer *eb, char c,
3639              unsigned long start, unsigned long len)
3640{
3641    size_t cur;
3642    size_t offset;
3643    struct page *page;
3644    char *kaddr;
3645    size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3646    unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3647
3648    WARN_ON(start > eb->len);
3649    WARN_ON(start + len > eb->start + eb->len);
3650
3651    offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3652
3653    while (len > 0) {
3654        page = extent_buffer_page(eb, i);
3655        WARN_ON(!PageUptodate(page));
3656
3657        cur = min(len, PAGE_CACHE_SIZE - offset);
3658        kaddr = kmap_atomic(page, KM_USER0);
3659        memset(kaddr + offset, c, cur);
3660        kunmap_atomic(kaddr, KM_USER0);
3661
3662        len -= cur;
3663        offset = 0;
3664        i++;
3665    }
3666}
3667
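/*
 * copy 'len' bytes from offset 'src_offset' of the 'src' extent buffer
 * into 'dst' at 'dst_offset'.  The two buffers are expected to be the
 * same length; the source bytes are pulled in via read_extent_buffer().
 */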
3668void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3669            unsigned long dst_offset, unsigned long src_offset,
3670            unsigned long len)
3671{
3672    u64 dst_len = dst->len;
3673    size_t cur;
3674    size_t offset;
3675    struct page *page;
3676    char *kaddr;
3677    size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3678    unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3679
3680    WARN_ON(src->len != dst_len);
3681
3682    offset = (start_offset + dst_offset) &
3683        ((unsigned long)PAGE_CACHE_SIZE - 1);
3684
3685    while (len > 0) {
3686        page = extent_buffer_page(dst, i);
3687        WARN_ON(!PageUptodate(page));
3688
3689        cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3690
3691        kaddr = kmap_atomic(page, KM_USER0);
3692        read_extent_buffer(src, kaddr + offset, src_offset, cur);
3693        kunmap_atomic(kaddr, KM_USER0);
3694
3695        src_offset += cur;
3696        len -= cur;
3697        offset = 0;
3698        i++;
3699    }
3700}
3701
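/*
 * helper for memmove_extent_buffer(): move 'len' bytes into a single
 * destination page.  When source and destination are the same page a
 * plain memmove() is used; otherwise the bytes are copied one at a time
 * from the end of the range toward the start.
 */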
3702static void move_pages(struct page *dst_page, struct page *src_page,
3703               unsigned long dst_off, unsigned long src_off,
3704               unsigned long len)
3705{
3706    char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3707    if (dst_page == src_page) {
3708        memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3709    } else {
3710        char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3711        char *p = dst_kaddr + dst_off + len;
3712        char *s = src_kaddr + src_off + len;
3713
3714        while (len--)
3715            *--p = *--s;
3716
3717        kunmap_atomic(src_kaddr, KM_USER1);
3718    }
3719    kunmap_atomic(dst_kaddr, KM_USER0);
3720}
3721
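/*
 * helper for memcpy_extent_buffer(): copy 'len' bytes between two pages
 * (or within one page) with a straight memcpy().
 */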
3722static void copy_pages(struct page *dst_page, struct page *src_page,
3723               unsigned long dst_off, unsigned long src_off,
3724               unsigned long len)
3725{
3726    char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3727    char *src_kaddr;
3728
3729    if (dst_page != src_page)
3730        src_kaddr = kmap_atomic(src_page, KM_USER1);
3731    else
3732        src_kaddr = dst_kaddr;
3733
3734    memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3735    kunmap_atomic(dst_kaddr, KM_USER0);
3736    if (dst_page != src_page)
3737        kunmap_atomic(src_kaddr, KM_USER1);
3738}
3739
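/*
 * copy 'len' bytes from 'src_offset' to 'dst_offset' inside the same
 * extent buffer, splitting the copy at page boundaries.  Both ranges
 * must fit inside dst->len.
 */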
3740void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3741               unsigned long src_offset, unsigned long len)
3742{
3743    size_t cur;
3744    size_t dst_off_in_page;
3745    size_t src_off_in_page;
3746    size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3747    unsigned long dst_i;
3748    unsigned long src_i;
3749
3750    if (src_offset + len > dst->len) {
3751        printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3752               "len %lu dst len %lu\n", src_offset, len, dst->len);
3753        BUG_ON(1);
3754    }
3755    if (dst_offset + len > dst->len) {
3756        printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3757               "len %lu dst len %lu\n", dst_offset, len, dst->len);
3758        BUG_ON(1);
3759    }
3760
3761    while (len > 0) {
3762        dst_off_in_page = (start_offset + dst_offset) &
3763            ((unsigned long)PAGE_CACHE_SIZE - 1);
3764        src_off_in_page = (start_offset + src_offset) &
3765            ((unsigned long)PAGE_CACHE_SIZE - 1);
3766
3767        dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3768        src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3769
3770        cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3771                           src_off_in_page));
3772        cur = min_t(unsigned long, cur,
3773            (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3774
3775        copy_pages(extent_buffer_page(dst, dst_i),
3776               extent_buffer_page(dst, src_i),
3777               dst_off_in_page, src_off_in_page, cur);
3778
3779        src_offset += cur;
3780        dst_offset += cur;
3781        len -= cur;
3782    }
3783}
3784
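/*
 * memmove() equivalent of memcpy_extent_buffer(): the source and
 * destination ranges may overlap.  A forward copy is used when the
 * destination sits below the source; otherwise the pages are walked from
 * the end so overlapping bytes are not clobbered.
 *
 * A hypothetical use, making room for one more item header in a leaf:
 *
 *	memmove_extent_buffer(leaf, off + sizeof(struct btrfs_item),
 *			      off, moved_bytes);
 */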
3785void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3786               unsigned long src_offset, unsigned long len)
3787{
3788    size_t cur;
3789    size_t dst_off_in_page;
3790    size_t src_off_in_page;
3791    unsigned long dst_end = dst_offset + len - 1;
3792    unsigned long src_end = src_offset + len - 1;
3793    size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3794    unsigned long dst_i;
3795    unsigned long src_i;
3796
3797    if (src_offset + len > dst->len) {
3798        printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3799               "len %lu len %lu\n", src_offset, len, dst->len);
3800        BUG_ON(1);
3801    }
3802    if (dst_offset + len > dst->len) {
3803        printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3804               "len %lu len %lu\n", dst_offset, len, dst->len);
3805        BUG_ON(1);
3806    }
3807    if (dst_offset < src_offset) {
3808        memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3809        return;
3810    }
3811    while (len > 0) {
3812        dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3813        src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3814
3815        dst_off_in_page = (start_offset + dst_end) &
3816            ((unsigned long)PAGE_CACHE_SIZE - 1);
3817        src_off_in_page = (start_offset + src_end) &
3818            ((unsigned long)PAGE_CACHE_SIZE - 1);
3819
3820        cur = min_t(unsigned long, len, src_off_in_page + 1);
3821        cur = min(cur, dst_off_in_page + 1);
3822        move_pages(extent_buffer_page(dst, dst_i),
3823               extent_buffer_page(dst, src_i),
3824               dst_off_in_page - cur + 1,
3825               src_off_in_page - cur + 1, cur);
3826
3827        dst_end -= cur;
3828        src_end -= cur;
3829        len -= cur;
3830    }
3831}
3832
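/*
 * try to free the extent buffer that starts at 'page': the buffer is
 * dropped only when nothing else holds a reference to it and it is not
 * dirty.  Returns 1 when the page can be released, 0 otherwise.
 */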
3833int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3834{
3835    u64 start = page_offset(page);
3836    struct extent_buffer *eb;
3837    int ret = 1;
3838    unsigned long i;
3839    unsigned long num_pages;
3840
3841    spin_lock(&tree->buffer_lock);
3842    eb = buffer_search(tree, start);
3843    if (!eb)
3844        goto out;
3845
3846    if (atomic_read(&eb->refs) > 1) {
3847        ret = 0;
3848        goto out;
3849    }
3850    if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3851        ret = 0;
3852        goto out;
3853    }
3854    /* at this point we can safely release the extent buffer */
3855    num_pages = num_extent_pages(eb->start, eb->len);
3856    for (i = 0; i < num_pages; i++)
3857        page_cache_release(extent_buffer_page(eb, i));
3858    rb_erase(&eb->rb_node, &tree->buffer);
3859    __free_extent_buffer(eb);
3860out:
3861    spin_unlock(&tree->buffer_lock);
3862    return ret;
3863}
3864