
/*
 * linux/fs/ext3/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 * Big-endian to little-endian byte-swapping/bitmaps by
 * David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/jbd.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>

/*
 * balloc.c contains the block allocation and deallocation routines
 */

/*
 * The free blocks are managed by bitmaps. A file system contains several
 * block groups. Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block. Each descriptor contains the number of the bitmap block and
 * the free block count of the group. The descriptors are loaded in memory
 * when a file system is mounted (see ext3_fill_super).
 */


#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)

/*
 * Calculate the block group number and offset, given a block number
 */
static void ext3_get_group_no_and_offset(struct super_block *sb,
    ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
{
    struct ext3_super_block *es = EXT3_SB(sb)->s_es;

    blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
    if (offsetp)
        *offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
    if (blockgrpp)
        *blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
}
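
/*
 * Worked example (illustrative only, not part of the original source):
 * assume a 1 KB block size, so EXT3_BLOCKS_PER_GROUP(sb) == 8192, and
 * s_first_data_block == 1. Then for blocknr == 16385:
 *
 *    16385 - 1 = 16384
 *    group  = 16384 / 8192 = 2
 *    offset = 16384 % 8192 = 0
 *
 * i.e. block 16385 is the first block of block group 2.
 */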

/**
 * ext3_get_group_desc() -- load group descriptor from disk
 * @sb: super block
 * @block_group: given block group
 * @bh: pointer to the buffer head to store the block group descriptor
 */
struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
                         unsigned int block_group,
                         struct buffer_head ** bh)
{
    unsigned long group_desc;
    unsigned long offset;
    struct ext3_group_desc * desc;
    struct ext3_sb_info *sbi = EXT3_SB(sb);

    if (block_group >= sbi->s_groups_count) {
        ext3_error (sb, "ext3_get_group_desc",
                "block_group >= groups_count - "
                "block_group = %d, groups_count = %lu",
                block_group, sbi->s_groups_count);

        return NULL;
    }
    smp_rmb();

    group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
    offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
    if (!sbi->s_group_desc[group_desc]) {
        ext3_error (sb, "ext3_get_group_desc",
                "Group descriptor not loaded - "
                "block_group = %d, group_desc = %lu, desc = %lu",
                 block_group, group_desc, offset);
        return NULL;
    }

    desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
    if (bh)
        *bh = sbi->s_group_desc[group_desc];
    return desc + offset;
}
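
/*
 * Example usage (an illustrative sketch, mirroring callers such as
 * ext3_free_blocks_sb() below): fetch a group descriptor together with
 * the buffer head that backs it, so updates to the descriptor block can
 * be journalled:
 *
 *    struct buffer_head *gd_bh;
 *    struct ext3_group_desc *desc;
 *
 *    desc = ext3_get_group_desc(sb, group_no, &gd_bh);
 *    if (!desc)
 *        goto error_return;
 *    free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 */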

static int ext3_valid_block_bitmap(struct super_block *sb,
                    struct ext3_group_desc *desc,
                    unsigned int block_group,
                    struct buffer_head *bh)
{
    ext3_grpblk_t offset;
    ext3_grpblk_t next_zero_bit;
    ext3_fsblk_t bitmap_blk;
    ext3_fsblk_t group_first_block;

    group_first_block = ext3_group_first_block_no(sb, block_group);

    /* check whether block bitmap block number is set */
    bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
    offset = bitmap_blk - group_first_block;
    if (!ext3_test_bit(offset, bh->b_data))
        /* bad block bitmap */
        goto err_out;

    /* check whether the inode bitmap block number is set */
    bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
    offset = bitmap_blk - group_first_block;
    if (!ext3_test_bit(offset, bh->b_data))
        /* bad block bitmap */
        goto err_out;

    /* check whether the inode table block number is set */
    bitmap_blk = le32_to_cpu(desc->bg_inode_table);
    offset = bitmap_blk - group_first_block;
    next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
                offset + EXT3_SB(sb)->s_itb_per_group,
                offset);
    if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
        /* good bitmap for inode tables */
        return 1;

err_out:
    ext3_error(sb, __func__,
            "Invalid block bitmap - "
            "block_group = %d, block = %lu",
            block_group, bitmap_blk);
    return 0;
}

/**
 * read_block_bitmap()
 * @sb: super block
 * @block_group: given block group
 *
 * Read the bitmap for a given block_group, and validate that the
 * bits for the block/inode/inode table blocks are set in the bitmap.
 *
 * Return buffer_head on success or NULL in case of failure.
 */
static struct buffer_head *
read_block_bitmap(struct super_block *sb, unsigned int block_group)
{
    struct ext3_group_desc * desc;
    struct buffer_head * bh = NULL;
    ext3_fsblk_t bitmap_blk;

    desc = ext3_get_group_desc(sb, block_group, NULL);
    if (!desc)
        return NULL;
    bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
    bh = sb_getblk(sb, bitmap_blk);
    if (unlikely(!bh)) {
        ext3_error(sb, __func__,
                "Cannot read block bitmap - "
                "block_group = %d, block_bitmap = %u",
                block_group, le32_to_cpu(desc->bg_block_bitmap));
        return NULL;
    }
    if (likely(bh_uptodate_or_lock(bh)))
        return bh;

    if (bh_submit_read(bh) < 0) {
        brelse(bh);
        ext3_error(sb, __func__,
                "Cannot read block bitmap - "
                "block_group = %d, block_bitmap = %u",
                block_group, le32_to_cpu(desc->bg_block_bitmap));
        return NULL;
    }
    ext3_valid_block_bitmap(sb, desc, block_group, bh);
    /*
     * The file system was mounted not to panic on errors, so continue
     * with the corrupt bitmap.
     */
    return bh;
}
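
/*
 * Example usage (an illustrative sketch of the caller pattern used in
 * ext3_new_blocks() below): pair each successful read_block_bitmap()
 * with a brelse() once the bitmap buffer is no longer needed:
 *
 *    bitmap_bh = read_block_bitmap(sb, group_no);
 *    if (!bitmap_bh)
 *        goto io_error;
 *    ... scan or modify bitmap_bh->b_data under the group lock ...
 *    brelse(bitmap_bh);
 */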

/*
 * The reservation window structure operations
 * --------------------------------------------
 * Operations include:
 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
 *
 * We use a red-black tree to represent per-filesystem reservation
 * windows.
 *
 */

/**
 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
 * @root: root of the per-filesystem reservation rb tree
 * @verbose: verbose mode
 * @fn: function which wishes to dump the reservation map
 *
 * If verbose is turned on, it will print the whole block reservation
 * windows (start, end). Otherwise, it will only print out the "bad" windows,
 * those windows that overlap with their immediate neighbors.
 */
#if 1
static void __rsv_window_dump(struct rb_root *root, int verbose,
                  const char *fn)
{
    struct rb_node *n;
    struct ext3_reserve_window_node *rsv, *prev;
    int bad;

restart:
    n = rb_first(root);
    bad = 0;
    prev = NULL;

    printk("Block Allocation Reservation Windows Map (%s):\n", fn);
    while (n) {
        rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
        if (verbose)
            printk("reservation window 0x%p "
                   "start: %lu, end: %lu\n",
                   rsv, rsv->rsv_start, rsv->rsv_end);
        if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
            printk("Bad reservation %p (start >= end)\n",
                   rsv);
            bad = 1;
        }
        if (prev && prev->rsv_end >= rsv->rsv_start) {
            printk("Bad reservation %p (prev->end >= start)\n",
                   rsv);
            bad = 1;
        }
        if (bad) {
            if (!verbose) {
                printk("Restarting reservation walk in verbose mode\n");
                verbose = 1;
                goto restart;
            }
        }
        n = rb_next(n);
        prev = rsv;
    }
    printk("Window map complete.\n");
    BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
    __rsv_window_dump((root), (verbose), __func__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif

/**
 * goal_in_my_reservation()
 * @rsv: inode's reservation window
 * @grp_goal: given goal block relative to the allocation block group
 * @group: the current allocation block group
 * @sb: filesystem super block
 *
 * Test if the given goal block (group relative) is within the file's
 * own block reservation window range.
 *
 * If the reservation window is outside the goal allocation group, return 0;
 * grp_goal (given goal block) could be -1, which means no specific
 * goal block. In this case, always return 1.
 * If the goal block is within the reservation window, return 1;
 * otherwise, return 0.
 */
static int
goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
            unsigned int group, struct super_block * sb)
{
    ext3_fsblk_t group_first_block, group_last_block;

    group_first_block = ext3_group_first_block_no(sb, group);
    group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

    if ((rsv->_rsv_start > group_last_block) ||
        (rsv->_rsv_end < group_first_block))
        return 0;
    if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
        || (grp_goal + group_first_block > rsv->_rsv_end)))
        return 0;
    return 1;
}
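
/*
 * Worked example (illustrative only): with 8192 blocks per group and
 * s_first_data_block == 1, group 2 spans blocks [16385, 24576]. Given a
 * window [16400, 16415], grp_goal 20 maps to block 16385 + 20 = 16405,
 * which lies inside the window, so the function returns 1; grp_goal 100
 * (block 16485) falls outside it and returns 0.
 */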

/**
 * search_reserve_window()
 * @root: root of the reservation tree
 * @goal: target allocation block
 *
 * Find the reserved window which includes the goal, or the previous one
 * if the goal is not in any window.
 * Returns NULL if there are no windows or if all windows start after the goal.
 */
static struct ext3_reserve_window_node *
search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
{
    struct rb_node *n = root->rb_node;
    struct ext3_reserve_window_node *rsv;

    if (!n)
        return NULL;

    do {
        rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);

        if (goal < rsv->rsv_start)
            n = n->rb_left;
        else if (goal > rsv->rsv_end)
            n = n->rb_right;
        else
            return rsv;
    } while (n);
    /*
     * We've fallen off the end of the tree: the goal wasn't inside
     * any particular node. OK, the previous node must be to one
     * side of the interval containing the goal. If it's the RHS,
     * we need to back up one.
     */
    if (rsv->rsv_start > goal) {
        n = rb_prev(&rsv->rsv_node);
        rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
    }
    return rsv;
}
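
/*
 * Example (illustrative only): with windows [10, 20] and [40, 50] in the
 * tree, a goal of 15 returns the [10, 20] node (goal inside the window),
 * a goal of 30 also returns [10, 20] (the nearest window before the
 * goal), and a goal of 5 returns NULL, since every window starts after
 * the goal (rb_prev() yields no predecessor).
 */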

/**
 * ext3_rsv_window_add() -- Insert a window into the block reservation rb tree.
 * @sb: super block
 * @rsv: reservation window to add
 *
 * Must be called with rsv_lock held.
 */
void ext3_rsv_window_add(struct super_block *sb,
            struct ext3_reserve_window_node *rsv)
{
    struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
    struct rb_node *node = &rsv->rsv_node;
    ext3_fsblk_t start = rsv->rsv_start;

    struct rb_node ** p = &root->rb_node;
    struct rb_node * parent = NULL;
    struct ext3_reserve_window_node *this;

    while (*p)
    {
        parent = *p;
        this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);

        if (start < this->rsv_start)
            p = &(*p)->rb_left;
        else if (start > this->rsv_end)
            p = &(*p)->rb_right;
        else {
            rsv_window_dump(root, 1);
            BUG();
        }
    }

    rb_link_node(node, parent, p);
    rb_insert_color(node, root);
}

/**
 * rsv_window_remove() -- unlink a window from the reservation rb tree
 * @sb: super block
 * @rsv: reservation window to remove
 *
 * Mark the block reservation window as not allocated, and unlink it
 * from the filesystem reservation window rb tree. Must be called with
 * rsv_lock held.
 */
static void rsv_window_remove(struct super_block *sb,
                  struct ext3_reserve_window_node *rsv)
{
    rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
    rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
    rsv->rsv_alloc_hit = 0;
    rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}

/*
 * rsv_is_empty() -- Check whether the reservation window is unallocated.
 * @rsv: given reservation window to check
 *
 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
 */
static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
{
    /* a valid reservation end block could not be 0 */
    return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
}

/**
 * ext3_init_block_alloc_info()
 * @inode: file inode structure
 *
 * Allocate and initialize the reservation window structure, and
 * finally link the window to the ext3 inode structure.
 *
 * The reservation window structure is only dynamically allocated
 * and linked to the ext3 inode the first time the open file
 * needs a new block. So, before every ext3_new_block(s) call, for
 * regular files, we should check whether the reservation window
 * structure exists or not. If it does not exist, this function is called.
 * Failure to do so will result in block reservation being turned off for
 * that open file.
 *
 * This function is called from ext3_get_blocks_handle(), and is also called
 * when the reservation window size is set through ioctl before the file
 * is opened for write (which needs block allocation).
 *
 * The caller must hold truncate_mutex when calling this function.
 */
void ext3_init_block_alloc_info(struct inode *inode)
{
    struct ext3_inode_info *ei = EXT3_I(inode);
    struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
    struct super_block *sb = inode->i_sb;

    block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
    if (block_i) {
        struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;

        rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
        rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;

        /*
         * if the filesystem is mounted with NORESERVATION, the goal
         * reservation window size is set to zero to indicate
         * block reservation is off
         */
        if (!test_opt(sb, RESERVATION))
            rsv->rsv_goal_size = 0;
        else
            rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
        rsv->rsv_alloc_hit = 0;
        block_i->last_alloc_logical_block = 0;
        block_i->last_alloc_physical_block = 0;
    }
    ei->i_block_alloc_info = block_i;
}
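
/*
 * Typical call pattern (an illustrative sketch of what callers such as
 * ext3_get_blocks_handle() do while holding truncate_mutex):
 *
 *    if (S_ISREG(inode->i_mode) && !EXT3_I(inode)->i_block_alloc_info)
 *        ext3_init_block_alloc_info(inode);
 */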

/**
 * ext3_discard_reservation()
 * @inode: inode
 *
 * Discard (free) the block reservation window on last file close, on
 * truncate, or at the last iput().
 *
 * It is called in three cases:
 * ext3_release_file(): when the last writer closes the file
 * ext3_clear_inode(): at the last iput(), when nobody links to this file.
 * ext3_truncate(): when the block indirect map is about to change.
 *
 */
void ext3_discard_reservation(struct inode *inode)
{
    struct ext3_inode_info *ei = EXT3_I(inode);
    struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
    struct ext3_reserve_window_node *rsv;
    spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;

    if (!block_i)
        return;

    rsv = &block_i->rsv_window_node;
    if (!rsv_is_empty(&rsv->rsv_window)) {
        spin_lock(rsv_lock);
        if (!rsv_is_empty(&rsv->rsv_window))
            rsv_window_remove(inode->i_sb, rsv);
        spin_unlock(rsv_lock);
    }
}

/**
 * ext3_free_blocks_sb() -- Free given blocks and update quota
 * @handle: handle to this transaction
 * @sb: super block
 * @block: start physical block to free
 * @count: number of blocks to free
 * @pdquot_freed_blocks: pointer to quota
 */
void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
             ext3_fsblk_t block, unsigned long count,
             unsigned long *pdquot_freed_blocks)
{
    struct buffer_head *bitmap_bh = NULL;
    struct buffer_head *gd_bh;
    unsigned long block_group;
    ext3_grpblk_t bit;
    unsigned long i;
    unsigned long overflow;
    struct ext3_group_desc * desc;
    struct ext3_super_block * es;
    struct ext3_sb_info *sbi;
    int err = 0, ret;
    ext3_grpblk_t group_freed;

    *pdquot_freed_blocks = 0;
    sbi = EXT3_SB(sb);
    es = sbi->s_es;
    if (block < le32_to_cpu(es->s_first_data_block) ||
        block + count < block ||
        block + count > le32_to_cpu(es->s_blocks_count)) {
        ext3_error (sb, "ext3_free_blocks",
                "Freeing blocks not in datazone - "
                "block = "E3FSBLK", count = %lu", block, count);
        goto error_return;
    }

    ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);

do_more:
    overflow = 0;
    block_group = (block - le32_to_cpu(es->s_first_data_block)) /
              EXT3_BLOCKS_PER_GROUP(sb);
    bit = (block - le32_to_cpu(es->s_first_data_block)) %
              EXT3_BLOCKS_PER_GROUP(sb);
    /*
     * Check to see if we are freeing blocks across a group
     * boundary.
     */
    if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
        overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
        count -= overflow;
    }
    brelse(bitmap_bh);
    bitmap_bh = read_block_bitmap(sb, block_group);
    if (!bitmap_bh)
        goto error_return;
    desc = ext3_get_group_desc (sb, block_group, &gd_bh);
    if (!desc)
        goto error_return;

    if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
        in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
        in_range (block, le32_to_cpu(desc->bg_inode_table),
              sbi->s_itb_per_group) ||
        in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
              sbi->s_itb_per_group)) {
        ext3_error (sb, "ext3_free_blocks",
                "Freeing blocks in system zones - "
                "Block = "E3FSBLK", count = %lu",
                block, count);
        goto error_return;
    }

    /*
     * We are about to start releasing blocks in the bitmap,
     * so we need undo access.
     */
    /* @@@ check errors */
    BUFFER_TRACE(bitmap_bh, "getting undo access");
    err = ext3_journal_get_undo_access(handle, bitmap_bh);
    if (err)
        goto error_return;

    /*
     * We are about to modify some metadata. Call the journal APIs
     * to unshare ->b_data if a currently-committing transaction is
     * using it
     */
    BUFFER_TRACE(gd_bh, "get_write_access");
    err = ext3_journal_get_write_access(handle, gd_bh);
    if (err)
        goto error_return;

    jbd_lock_bh_state(bitmap_bh);

    for (i = 0, group_freed = 0; i < count; i++) {
        /*
         * An HJ special. This is expensive...
         */
#ifdef CONFIG_JBD_DEBUG
        jbd_unlock_bh_state(bitmap_bh);
        {
            struct buffer_head *debug_bh;
            debug_bh = sb_find_get_block(sb, block + i);
            if (debug_bh) {
                BUFFER_TRACE(debug_bh, "Deleted!");
                if (!bh2jh(bitmap_bh)->b_committed_data)
                    BUFFER_TRACE(debug_bh,
                        "No committed data in bitmap");
                BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
                __brelse(debug_bh);
            }
        }
        jbd_lock_bh_state(bitmap_bh);
#endif
        if (need_resched()) {
            jbd_unlock_bh_state(bitmap_bh);
            cond_resched();
            jbd_lock_bh_state(bitmap_bh);
        }
        /* @@@ This prevents newly-allocated data from being
         * freed and then reallocated within the same
         * transaction.
         *
         * Ideally we would want to allow that to happen, but to
         * do so requires making journal_forget() capable of
         * revoking the queued write of a data block, which
         * implies blocking on the journal lock. *forget()
         * cannot block due to truncate races.
         *
         * Eventually we can fix this by making journal_forget()
         * return a status indicating whether or not it was able
         * to revoke the buffer. On successful revoke, it is
         * safe not to set the allocation bit in the committed
         * bitmap, because we know that there is no outstanding
         * activity on the buffer any more and so it is safe to
         * reallocate it.
         */
        BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
        J_ASSERT_BH(bitmap_bh,
                bh2jh(bitmap_bh)->b_committed_data != NULL);
        ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
                bh2jh(bitmap_bh)->b_committed_data);

        /*
         * We clear the bit in the bitmap after setting the committed
         * data bit, because this is the reverse order to that which
         * the allocator uses.
         */
        BUFFER_TRACE(bitmap_bh, "clear bit");
        if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
                        bit + i, bitmap_bh->b_data)) {
            jbd_unlock_bh_state(bitmap_bh);
            ext3_error(sb, __func__,
                "bit already cleared for block "E3FSBLK,
                 block + i);
            jbd_lock_bh_state(bitmap_bh);
            BUFFER_TRACE(bitmap_bh, "bit already cleared");
        } else {
            group_freed++;
        }
    }
    jbd_unlock_bh_state(bitmap_bh);

    spin_lock(sb_bgl_lock(sbi, block_group));
    le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
    spin_unlock(sb_bgl_lock(sbi, block_group));
    percpu_counter_add(&sbi->s_freeblocks_counter, count);

    /* We dirtied the bitmap block */
    BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
    err = ext3_journal_dirty_metadata(handle, bitmap_bh);

    /* And the group descriptor block */
    BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
    ret = ext3_journal_dirty_metadata(handle, gd_bh);
    if (!err) err = ret;
    *pdquot_freed_blocks += group_freed;

    if (overflow && !err) {
        block += count;
        count = overflow;
        goto do_more;
    }

error_return:
    brelse(bitmap_bh);
    ext3_std_error(sb, err);
    return;
}

/**
 * ext3_free_blocks() -- Free given blocks and update quota
 * @handle: handle for this transaction
 * @inode: inode
 * @block: start physical block to free
 * @count: number of blocks to free
 */
void ext3_free_blocks(handle_t *handle, struct inode *inode,
            ext3_fsblk_t block, unsigned long count)
{
    struct super_block * sb;
    unsigned long dquot_freed_blocks;

    sb = inode->i_sb;
    if (!sb) {
        printk ("ext3_free_blocks: nonexistent device");
        return;
    }
    ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
    if (dquot_freed_blocks)
        dquot_free_block(inode, dquot_freed_blocks);
    return;
}
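
/*
 * Example usage (an illustrative sketch of how the truncate path in
 * fs/ext3/inode.c releases a single block inside a running transaction):
 *
 *    ext3_free_blocks(handle, inode, nr, 1);
 *
 * where "nr" is the filesystem-wide number of the block being released;
 * the quota charge is dropped internally via dquot_free_block().
 */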

/**
 * ext3_test_allocatable()
 * @nr: block number (group relative) to test
 * @bh: bufferhead containing the bitmap of the given block group
 *
 * For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy. This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
{
    int ret;
    struct journal_head *jh = bh2jh(bh);

    if (ext3_test_bit(nr, bh->b_data))
        return 0;

    jbd_lock_bh_state(bh);
    if (!jh->b_committed_data)
        ret = 1;
    else
        ret = !ext3_test_bit(nr, jh->b_committed_data);
    jbd_unlock_bh_state(bh);
    return ret;
}

/**
 * bitmap_search_next_usable_block()
 * @start: the starting block (group relative) of the search
 * @bh: bufferhead containing the block group bitmap
 * @maxblocks: the ending block (group relative) of the reservation
 *
 * The bitmap search --- search forward alternately through the actual
 * bitmap on disk and the last-committed copy in the journal, until we find
 * a bit free in both bitmaps.
 */
static ext3_grpblk_t
bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
                    ext3_grpblk_t maxblocks)
{
    ext3_grpblk_t next;
    struct journal_head *jh = bh2jh(bh);

    while (start < maxblocks) {
        next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
        if (next >= maxblocks)
            return -1;
        if (ext3_test_allocatable(next, bh))
            return next;
        jbd_lock_bh_state(bh);
        if (jh->b_committed_data)
            start = ext3_find_next_zero_bit(jh->b_committed_data,
                            maxblocks, next);
        jbd_unlock_bh_state(bh);
    }
    return -1;
}

/**
 * find_next_usable_block()
 * @start: the starting block (group relative) from which to search for
 * the next allocatable block in the bitmap.
 * @bh: bufferhead containing the block group bitmap
 * @maxblocks: the ending block (group relative) for the search
 *
 * Find an allocatable block in a bitmap. We honor both the bitmap and
 * its last-committed copy (if that exists), and perform the "most
 * appropriate allocation" algorithm of looking for a free block near
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
static ext3_grpblk_t
find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
            ext3_grpblk_t maxblocks)
{
    ext3_grpblk_t here, next;
    char *p, *r;

    if (start > 0) {
        /*
         * The goal was occupied; search forward for a free
         * block within the next XX blocks.
         *
         * end_goal is more or less random, but it has to be
         * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
         * next 64-bit boundary is simple..
         */
        ext3_grpblk_t end_goal = (start + 63) & ~63;
        if (end_goal > maxblocks)
            end_goal = maxblocks;
        here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
        if (here < end_goal && ext3_test_allocatable(here, bh))
            return here;
        ext3_debug("Bit not found near goal\n");
    }

    here = start;
    if (here < 0)
        here = 0;

    p = bh->b_data + (here >> 3);
    r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
    next = (r - bh->b_data) << 3;

    if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
        return next;

    /*
     * The bitmap search --- search forward alternately through the actual
     * bitmap and the last-committed copy until we find a bit free in
     * both
     */
    here = bitmap_search_next_usable_block(here, bh, maxblocks);
    return here;
}

/**
 * claim_block()
 * @lock: the spin lock for this block group
 * @block: the free block (group relative) to allocate
 * @bh: the buffer_head containing the block group bitmap
 *
 * We think we can allocate this block in this bitmap. Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
 * block since we saw that it was not marked in b_committed_data. If it _was_
 * allocated and freed then clear the bit in the bitmap again and return
 * zero (failure).
 */
static inline int
claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
{
    struct journal_head *jh = bh2jh(bh);
    int ret;

    if (ext3_set_bit_atomic(lock, block, bh->b_data))
        return 0;
    jbd_lock_bh_state(bh);
    if (jh->b_committed_data && ext3_test_bit(block, jh->b_committed_data)) {
        ext3_clear_bit_atomic(lock, block, bh->b_data);
        ret = 0;
    } else {
        ret = 1;
    }
    jbd_unlock_bh_state(bh);
    return ret;
}
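
/*
 * Timeline of the race claim_block() guards against (illustrative):
 *
 *    CPU0                            CPU1
 *    ----                            ----
 *    ext3_test_allocatable()
 *    sees bit N free in both copies
 *                                    allocates block N, then frees it;
 *                                    the free sets N in b_committed_data
 *    sets bit N in b_data
 *    sees N set in b_committed_data
 *    clears bit N again, returns 0
 */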

/**
 * ext3_try_to_allocate()
 * @sb: superblock
 * @handle: handle to this transaction
 * @group: given allocation block group
 * @bitmap_bh: bufferhead holds the block bitmap
 * @grp_goal: given target block within the group
 * @count: target number of blocks to allocate
 * @my_rsv: reservation window
 *
 * Attempt to allocate blocks within a given range. Set the range of the
 * allocation first, then find the first free bit(s) from the bitmap (within
 * the range), and finally allocate the blocks by claiming the found free
 * bit(s) as allocated.
 *
 * To set the range of this allocation:
 * if there is a reservation window, only try to allocate block(s) from the
 * file's own reservation window;
 * otherwise, the allocation range starts from the given goal block and ends
 * at the block group's last block.
 *
 * If we failed to allocate the desired block then we may end up crossing to a
 * new bitmap. In that case we must release write access to the old one via
 * ext3_journal_release_buffer(), else we'll run out of credits.
 */
static ext3_grpblk_t
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
            struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
            unsigned long *count, struct ext3_reserve_window *my_rsv)
{
    ext3_fsblk_t group_first_block;
    ext3_grpblk_t start, end;
    unsigned long num = 0;

    /* we do allocation within the reservation window if we have a window */
    if (my_rsv) {
        group_first_block = ext3_group_first_block_no(sb, group);
        if (my_rsv->_rsv_start >= group_first_block)
            start = my_rsv->_rsv_start - group_first_block;
        else
            /* reservation window crosses group boundary */
            start = 0;
        end = my_rsv->_rsv_end - group_first_block + 1;
        if (end > EXT3_BLOCKS_PER_GROUP(sb))
            /* reservation window crosses group boundary */
            end = EXT3_BLOCKS_PER_GROUP(sb);
        if ((start <= grp_goal) && (grp_goal < end))
            start = grp_goal;
        else
            grp_goal = -1;
    } else {
        if (grp_goal > 0)
            start = grp_goal;
        else
            start = 0;
        end = EXT3_BLOCKS_PER_GROUP(sb);
    }

    BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));

repeat:
    if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
        grp_goal = find_next_usable_block(start, bitmap_bh, end);
        if (grp_goal < 0)
            goto fail_access;
        if (!my_rsv) {
            int i;

            for (i = 0; i < 7 && grp_goal > start &&
                    ext3_test_allocatable(grp_goal - 1,
                                bitmap_bh);
                    i++, grp_goal--)
                ;
        }
    }
    start = grp_goal;

    if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
        grp_goal, bitmap_bh)) {
        /*
         * The block was allocated by another thread, or it was
         * allocated and then freed by another thread
         */
        start++;
        grp_goal++;
        if (start >= end)
            goto fail_access;
        goto repeat;
    }
    num++;
    grp_goal++;
    while (num < *count && grp_goal < end
        && ext3_test_allocatable(grp_goal, bitmap_bh)
        && claim_block(sb_bgl_lock(EXT3_SB(sb), group),
                grp_goal, bitmap_bh)) {
        num++;
        grp_goal++;
    }
    *count = num;
    return grp_goal - num;
fail_access:
    *count = num;
    return -1;
}

/**
 * find_next_reservable_window():
 * find a reservable space within the given range.
 * It does not allocate the reservation window for now:
 * alloc_new_reservation() will do the work later.
 *
 * @search_head: the head of the searching list;
 * This is not necessarily the list head of the whole filesystem.
 *
 * We have both head and start_block to assist the search
 * for the reservable space. The list starts from head,
 * but we will shift to the place where start_block is,
 * then start from there, when looking for a reservable space.
 *
 * @my_rsv: the reservation window
 *
 * @sb: the super block
 *
 * @start_block: the first block we consider to start
 * the real search from
 *
 * @last_block:
 * the maximum block number that our goal reservable space
 * could start from. This is normally the last block in this
 * group. The search ends when we find that the start of the next
 * possible reservable space is out of this boundary.
 * This handles the case of a reservation window request that
 * crosses a group boundary.
 *
 * Basically we search the given range (start_block, last_block),
 * rather than the whole reservation double linked list,
 * to find a free region that is of my size and has not
 * been reserved.
 *
 */
static int find_next_reservable_window(
                struct ext3_reserve_window_node *search_head,
                struct ext3_reserve_window_node *my_rsv,
                struct super_block * sb,
                ext3_fsblk_t start_block,
                ext3_fsblk_t last_block)
{
    struct rb_node *next;
    struct ext3_reserve_window_node *rsv, *prev;
    ext3_fsblk_t cur;
    int size = my_rsv->rsv_goal_size;

    /* TODO: make the start of the reservation window byte-aligned */
    /* cur = *start_block & ~7;*/
    cur = start_block;
    rsv = search_head;
    if (!rsv)
        return -1;

    while (1) {
        if (cur <= rsv->rsv_end)
            cur = rsv->rsv_end + 1;

        /* TODO?
         * in the case we could not find a reservable space
         * of the expected size, during the re-search, we could
         * remember the largest reservable space we could have
         * and return that one.
         *
         * For now it will fail if we could not find the reservable
         * space with the expected size (or more)...
         */
        if (cur > last_block)
            return -1; /* fail */

        prev = rsv;
        next = rb_next(&rsv->rsv_node);
        rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);

        /*
         * Reached the last reservation, we can just append to the
         * previous one.
         */
        if (!next)
            break;

        if (cur + size <= rsv->rsv_start) {
            /*
             * Found a reservable space big enough. We could
             * have a reservation across the group boundary here
             */
            break;
        }
    }
    /*
     * We come here either when we reach the end of the whole list,
     * and there is empty reservable space after the last entry in
     * the list, in which case we append to the end of the list;
     * or when we find a reservable space in the middle of the list,
     * in which case we return the reservation window that we can
     * append to. Either way we succeed.
     */

    if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
        rsv_window_remove(sb, my_rsv);

    /*
     * Let's book the whole available window for now. We will check the
     * disk bitmap later and then, if there are free blocks then we adjust
     * the window size if it's larger than requested.
     * Otherwise, we will remove this node from the tree the next time
     * find_next_reservable_window() is called.
     */
    my_rsv->rsv_start = cur;
    my_rsv->rsv_end = cur + size - 1;
    my_rsv->rsv_alloc_hit = 0;

    if (prev != my_rsv)
        ext3_rsv_window_add(sb, my_rsv);

    return 0;
}
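
/*
 * Worked example (illustrative only): searching for an 8-block window
 * starting at block 1000, with existing windows [990, 1010] and
 * [1030, 1040] in the tree: cur is first bumped past the head window to
 * 1011, and since 1011 + 8 <= 1030 the new window becomes [1011, 1018],
 * slotted between the two existing reservations.
 */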

/**
 * alloc_new_reservation()--allocate a new reservation window
 *
 * To make a new reservation, we search part of the filesystem
 * reservation list (the part that falls inside the group). We try to
 * allocate a new reservation window near the allocation goal,
 * or at the beginning of the group if there is no goal.
 *
 * We first find a reservable space after the goal, then from
 * there, we check the bitmap for the first free block after
 * it. If there is no free block until the end of the group, then the
 * whole group is full and we fail. Otherwise, we check if the free
 * block is inside the expected reservable space; if so, we
 * succeed.
 * If the first free block is outside the reservable space, then
 * starting from the first free block, we search for the next available
 * space, and go on.
 *
 * On success, a new reservation is found and inserted into the list.
 * It contains at least one free block, and it does not overlap with other
 * reservation windows.
 *
 * failed: we failed to find a reservation window in this group
 *
 * @my_rsv: the reservation window
 *
 * @grp_goal: The goal (group-relative). It is where the search for a
 * free reservable space should start from.
 * If we have a grp_goal (grp_goal >= 0), then start from there;
 * with no grp_goal (grp_goal = -1), we start from the first block
 * of the group.
 *
 * @sb: the super block
 * @group: the group we are trying to allocate in
 * @bitmap_bh: the block group block bitmap
 *
 */
static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
        ext3_grpblk_t grp_goal, struct super_block *sb,
        unsigned int group, struct buffer_head *bitmap_bh)
{
    struct ext3_reserve_window_node *search_head;
    ext3_fsblk_t group_first_block, group_end_block, start_block;
    ext3_grpblk_t first_free_block;
    struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
    unsigned long size;
    int ret;
    spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;

    group_first_block = ext3_group_first_block_no(sb, group);
    group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

    if (grp_goal < 0)
        start_block = group_first_block;
    else
        start_block = grp_goal + group_first_block;

    size = my_rsv->rsv_goal_size;

    if (!rsv_is_empty(&my_rsv->rsv_window)) {
        /*
         * if the old reservation crosses the group boundary
         * and if the goal is inside the old reservation window,
         * we will come here when we just failed to allocate from
         * the first part of the window. We still have another part
         * that belongs to the next group. In this case, there is no
         * point in discarding our window and trying to allocate a
         * new one in this group (which will fail); we should
         * keep the reservation window, and simply move on.
         *
         * Maybe we could shift the start block of the reservation
         * window to the first block of the next group.
         */

        if ((my_rsv->rsv_start <= group_end_block) &&
                (my_rsv->rsv_end > group_end_block) &&
                (start_block >= my_rsv->rsv_start))
            return -1;

        if ((my_rsv->rsv_alloc_hit >
             (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
            /*
             * if the previous allocation hit ratio is
             * greater than 1/2, then we double the size of
             * the reservation window the next time,
             * otherwise we keep the same size window
             */
            size = size * 2;
            if (size > EXT3_MAX_RESERVE_BLOCKS)
                size = EXT3_MAX_RESERVE_BLOCKS;
            my_rsv->rsv_goal_size = size;
        }
    }

    spin_lock(rsv_lock);
    /*
     * shift the search start to the window near the goal block
     */
    search_head = search_reserve_window(fs_rsv_root, start_block);

    /*
     * find_next_reservable_window() simply finds a reservable window
     * inside the given range (start_block, group_end_block).
     *
     * To make sure the reservation window has a free bit inside it, we
     * need to check the bitmap after we find a reservable window.
     */
retry:
    ret = find_next_reservable_window(search_head, my_rsv, sb,
                        start_block, group_end_block);

    if (ret == -1) {
        if (!rsv_is_empty(&my_rsv->rsv_window))
            rsv_window_remove(sb, my_rsv);
        spin_unlock(rsv_lock);
        return -1;
    }

    /*
     * On success, find_next_reservable_window() returns the
     * reservation window where there is a reservable space after it.
     * Before we reserve this reservable space, we need
     * to make sure there is at least a free block inside this region.
     *
     * Search for the first free bit on the block bitmap and the copy of
     * the last committed bitmap alternately, until we find an allocatable
     * block. The search starts from the start block of the reservable
     * space we just found.
     */
    spin_unlock(rsv_lock);
    first_free_block = bitmap_search_next_usable_block(
            my_rsv->rsv_start - group_first_block,
            bitmap_bh, group_end_block - group_first_block + 1);

    if (first_free_block < 0) {
        /*
         * no free block left on the bitmap, so there is no point
         * in reserving the space. return failure.
         */
        spin_lock(rsv_lock);
        if (!rsv_is_empty(&my_rsv->rsv_window))
            rsv_window_remove(sb, my_rsv);
        spin_unlock(rsv_lock);
        return -1; /* failed */
    }

    start_block = first_free_block + group_first_block;
    /*
     * check if the first free block is within the
     * free space we just reserved
     */
    if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
        return 0; /* success */
    /*
     * if the first free bit we found is out of the reservable space,
     * continue the search for the next reservable space,
     * starting from where the free block is;
     * we also shift the list head to where we stopped last time
     */
    search_head = my_rsv;
    spin_lock(rsv_lock);
    goto retry;
}

/**
 * try_to_extend_reservation()
 * @my_rsv: given reservation window
 * @sb: super block
 * @size: the delta to extend
 *
 * Attempt to expand the reservation window so that it is large enough to
 * hold the required number of free blocks.
 *
 * Since ext3_try_to_allocate() will always allocate blocks within
 * the reservation window range, if the window size is too small,
 * a multiple-block allocation has to stop at the end of the reservation
 * window. To make this more efficient, given the total number of
 * blocks needed and the current size of the window, we try to
 * expand the reservation window size if necessary on a best-effort
 * basis before ext3_new_blocks() tries to allocate blocks.
 */
static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
            struct super_block *sb, int size)
{
    struct ext3_reserve_window_node *next_rsv;
    struct rb_node *next;
    spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;

    if (!spin_trylock(rsv_lock))
        return;

    next = rb_next(&my_rsv->rsv_node);

    if (!next)
        my_rsv->rsv_end += size;
    else {
        next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);

        if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
            my_rsv->rsv_end += size;
        else
            my_rsv->rsv_end = next_rsv->rsv_start - 1;
    }
    spin_unlock(rsv_lock);
}
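
/*
 * Worked example (illustrative only): if my_rsv covers [100, 107] and the
 * next window starts at 120, the gap is 120 - 107 - 1 = 12 blocks, so
 * try_to_extend_reservation(my_rsv, sb, 8) grows the end to 115; a request
 * for 20 more blocks would instead stop at 119, just short of the
 * neighbouring window.
 */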

/**
 * ext3_try_to_allocate_with_rsv()
 * @sb: superblock
 * @handle: handle to this transaction
 * @group: given allocation block group
 * @bitmap_bh: bufferhead holds the block bitmap
 * @grp_goal: given target block within the group
 * @my_rsv: reservation window
 * @count: target number of blocks to allocate
 * @errp: pointer to store the error code
 *
 * This is the main function used to allocate a new block and its reservation
 * window.
 *
 * Each time a new block allocation is needed, we first try to allocate from
 * the inode's own reservation. If it does not have a reservation window,
 * then, instead of first looking for a free bit in the bitmap and then
 * checking the reservation list to see whether that bit falls inside
 * somebody else's reservation window, we try to allocate a reservation
 * window for the inode, starting from the goal, and then do the block
 * allocation within that window.
 *
 * This avoids repeatedly searching the reservation list when somebody is
 * looking for a free block (without a reservation) and there are lots of
 * free blocks, but they are all being reserved.
 *
 * We use a red-black tree for the per-filesystem reservation list.
 *
 */
static ext3_grpblk_t
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
            unsigned int group, struct buffer_head *bitmap_bh,
            ext3_grpblk_t grp_goal,
            struct ext3_reserve_window_node * my_rsv,
            unsigned long *count, int *errp)
{
    ext3_fsblk_t group_first_block, group_last_block;
    ext3_grpblk_t ret = 0;
    int fatal;
    unsigned long num = *count;

    *errp = 0;

    /*
     * Make sure we use undo access for the bitmap, because it is critical
     * that we do the frozen_data COW on bitmap buffers in all cases even
     * if the buffer is in BJ_Forget state in the committing transaction.
     */
    BUFFER_TRACE(bitmap_bh, "get undo access for new block");
    fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
    if (fatal) {
        *errp = fatal;
        return -1;
    }

    /*
     * we don't deal with reservation when
     * the filesystem is mounted without reservation,
     * or the file is not a regular file,
     * or the last attempt to allocate a block with reservation turned on
     * failed
     */
    if (my_rsv == NULL) {
        ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
                        grp_goal, count, NULL);
        goto out;
    }
    /*
     * grp_goal is a group relative block number (if there is a goal)
     * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
     * group_first_block is the filesystem-wide block number of the
     * first block in this group
     */
    group_first_block = ext3_group_first_block_no(sb, group);
    group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

    /*
     * Basically we will allocate a new block from the inode's reservation
     * window.
     *
     * We need to allocate a new reservation window, if:
     * a) the inode does not have a reservation window; or
     * b) the last attempt to allocate a block from the existing
     * reservation failed; or
     * c) we come here with a goal and with a reservation window
     *
     * We do not need to allocate a new reservation window if we come here
     * at the beginning with a goal and the goal is inside the window, or
     * we don't have a goal but already have a reservation window;
     * then we can allocate from the reservation window directly.
     */
    while (1) {
        if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
            !goal_in_my_reservation(&my_rsv->rsv_window,
                        grp_goal, group, sb)) {
            if (my_rsv->rsv_goal_size < *count)
                my_rsv->rsv_goal_size = *count;
            ret = alloc_new_reservation(my_rsv, grp_goal, sb,
                            group, bitmap_bh);
            if (ret < 0)
                break; /* failed */

            if (!goal_in_my_reservation(&my_rsv->rsv_window,
                            grp_goal, group, sb))
                grp_goal = -1;
        } else if (grp_goal >= 0) {
            int curr = my_rsv->rsv_end -
                    (grp_goal + group_first_block) + 1;

            if (curr < *count)
                try_to_extend_reservation(my_rsv, sb,
                            *count - curr);
        }

        if ((my_rsv->rsv_start > group_last_block) ||
                (my_rsv->rsv_end < group_first_block)) {
            rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
            BUG();
        }
        ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
                       grp_goal, &num, &my_rsv->rsv_window);
        if (ret >= 0) {
            my_rsv->rsv_alloc_hit += num;
            *count = num;
            break; /* succeed */
        }
        num = *count;
    }
out:
    if (ret >= 0) {
        BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
                    "bitmap block");
        fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
        if (fatal) {
            *errp = fatal;
            return -1;
        }
        return ret;
    }

    BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
    ext3_journal_release_buffer(handle, bitmap_bh);
    return ret;
}

/**
 * ext3_has_free_blocks()
 * @sbi: in-core super block structure.
 *
 * Check if filesystem has at least 1 free block available for allocation.
 */
static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
{
    ext3_fsblk_t free_blocks, root_blocks;

    free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
    root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
    if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
        sbi->s_resuid != current_fsuid() &&
        (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
        return 0;
    }
    return 1;
}

/**
 * ext3_should_retry_alloc()
 * @sb: super block
 * @retries: number of attempts that have been made
 *
 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
 * for the current or committing transaction to complete, and then
 * return TRUE.
 *
 * If the total number of retries exceeds three, return FALSE.
 */
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
    if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
        return 0;

    jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

    return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}
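
/*
 * Example usage (an illustrative sketch of the caller pattern used
 * throughout ext3, e.g. in the write paths; "do_allocating_operation"
 * is a hypothetical placeholder):
 *
 *    int retries = 0;
 *    int ret;
 *
 *    retry:
 *    ret = do_allocating_operation(...);
 *    if (ret == -ENOSPC && ext3_should_retry_alloc(sb, &retries))
 *        goto retry;
 */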

/**
 * ext3_new_blocks() -- core block(s) allocation function
 * @handle: handle to this transaction
 * @inode: file inode
 * @goal: given target block (filesystem wide)
 * @count: target number of blocks to allocate
 * @errp: error code
 *
 * ext3_new_blocks uses a goal block to assist allocation. It first tries to
 * allocate block(s) from the block group that contains the goal block. If
 * that fails, it will try to allocate block(s) from other block groups
 * without any specific goal block.
 *
 */
ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
            ext3_fsblk_t goal, unsigned long *count, int *errp)
{
    struct buffer_head *bitmap_bh = NULL;
    struct buffer_head *gdp_bh;
    int group_no;
    int goal_group;
    ext3_grpblk_t grp_target_blk; /* blockgroup relative goal block */
    ext3_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block */
    ext3_fsblk_t ret_block; /* filesystem-wide allocated block */
    int bgi; /* blockgroup iteration index */
    int fatal = 0, err;
    int performed_allocation = 0;
    ext3_grpblk_t free_blocks; /* number of free blocks in a group */
    struct super_block *sb;
    struct ext3_group_desc *gdp;
    struct ext3_super_block *es;
    struct ext3_sb_info *sbi;
    struct ext3_reserve_window_node *my_rsv = NULL;
    struct ext3_block_alloc_info *block_i;
    unsigned short windowsz = 0;
#ifdef EXT3FS_DEBUG
    static int goal_hits, goal_attempts;
#endif
    unsigned long ngroups;
    unsigned long num = *count;

    *errp = -ENOSPC;
    sb = inode->i_sb;
    if (!sb) {
        printk("ext3_new_block: nonexistent device");
        return 0;
    }

    /*
     * Check quota for allocation of this block.
     */
    err = dquot_alloc_block(inode, num);
    if (err) {
        *errp = err;
        return 0;
    }

    sbi = EXT3_SB(sb);
    es = EXT3_SB(sb)->s_es;
    ext3_debug("goal=%lu.\n", goal);
    /*
     * Allocate a block from the reservation only when
     * the filesystem is mounted with reservation (the default,
     * -o reservation), and
     * it's a regular file, and
     * the desired window size is greater than 0 (one could use ioctl
     * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
     * reservation on that particular file)
     */
    block_i = EXT3_I(inode)->i_block_alloc_info;
    if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
        my_rsv = &block_i->rsv_window_node;

    if (!ext3_has_free_blocks(sbi)) {
        *errp = -ENOSPC;
        goto out;
    }

    /*
     * First, test whether the goal block is free.
     */
    if (goal < le32_to_cpu(es->s_first_data_block) ||
        goal >= le32_to_cpu(es->s_blocks_count))
        goal = le32_to_cpu(es->s_first_data_block);
    group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
            EXT3_BLOCKS_PER_GROUP(sb);
    goal_group = group_no;
1560retry_alloc:
1561    gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1562    if (!gdp)
1563        goto io_error;
1564
1565    free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1566    /*
1567     * if there are not enough free blocks to make a new reservation,
1568     * turn off reservation for this allocation
1569     */
1570    if (my_rsv && (free_blocks < windowsz)
1571        && (free_blocks > 0)
1572        && (rsv_is_empty(&my_rsv->rsv_window)))
1573        my_rsv = NULL;
1574
1575    if (free_blocks > 0) {
1576        grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
1577                EXT3_BLOCKS_PER_GROUP(sb));
1578        bitmap_bh = read_block_bitmap(sb, group_no);
1579        if (!bitmap_bh)
1580            goto io_error;
1581        grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1582                    group_no, bitmap_bh, grp_target_blk,
1583                    my_rsv, &num, &fatal);
1584        if (fatal)
1585            goto out;
1586        if (grp_alloc_blk >= 0)
1587            goto allocated;
1588    }
1589
1590    ngroups = EXT3_SB(sb)->s_groups_count;
1591    smp_rmb();
1592
1593    /*
1594     * Now search the rest of the groups. We assume that
1595     * group_no and gdp correctly point to the last group visited.
1596     */
1597    for (bgi = 0; bgi < ngroups; bgi++) {
1598        group_no++;
1599        if (group_no >= ngroups)
1600            group_no = 0;
1601        gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
1602        if (!gdp)
1603            goto io_error;
1604        free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1605        /*
1606         * skip this group (and avoid loading bitmap) if there
1607         * are no free blocks
1608         */
1609        if (!free_blocks)
1610            continue;
1611        /*
1612         * skip this group if the number of
1613         * free blocks is less than half of the reservation
1614         * window size.
1615         */
1616        if (my_rsv && (free_blocks <= (windowsz/2)))
1617            continue;
1618
1619        brelse(bitmap_bh);
1620        bitmap_bh = read_block_bitmap(sb, group_no);
1621        if (!bitmap_bh)
1622            goto io_error;
1623        /*
1624         * try to allocate block(s) from this group, without a goal (-1).
1625         */
1626        grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
1627                    group_no, bitmap_bh, -1, my_rsv,
1628                    &num, &fatal);
1629        if (fatal)
1630            goto out;
1631        if (grp_alloc_blk >= 0)
1632            goto allocated;
1633    }
1634    /*
1635     * We may end up with a bogus earlier ENOSPC error because the
1636     * filesystem is "full" of reservations, while there may in fact
1637     * be free blocks available on disk. In this case, we just forget
1638     * about the reservations and retry the block allocation as if
1639     * there were no reservations.
1640     */
1641    if (my_rsv) {
1642        my_rsv = NULL;
1643        windowsz = 0;
1644        group_no = goal_group;
1645        goto retry_alloc;
1646    }
1647    /* No space left on the device */
1648    *errp = -ENOSPC;
1649    goto out;
1650
1651allocated:
1652
1653    ext3_debug("using block group %d(%d)\n",
1654            group_no, gdp->bg_free_blocks_count);
1655
1656    BUFFER_TRACE(gdp_bh, "get_write_access");
1657    fatal = ext3_journal_get_write_access(handle, gdp_bh);
1658    if (fatal)
1659        goto out;
1660
1661    ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);
1662
1663    if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
1664        in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
1665        in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
1666              EXT3_SB(sb)->s_itb_per_group) ||
1667        in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
1668              EXT3_SB(sb)->s_itb_per_group)) {
1669        ext3_error(sb, "ext3_new_block",
1670                "Allocating block in system zone - "
1671                "blocks from "E3FSBLK", length %lu",
1672                 ret_block, num);
1673        /*
1674         * claim_block() marked the blocks we allocated as in use. So we
1675         * may want to selectively mark some of the blocks as free.
1676         */
1677        goto retry_alloc;
1678    }
1679
1680    performed_allocation = 1;
1681
1682#ifdef CONFIG_JBD_DEBUG
1683    {
1684        struct buffer_head *debug_bh;
1685
1686        /* Record bitmap buffer state in the newly allocated block */
1687        debug_bh = sb_find_get_block(sb, ret_block);
1688        if (debug_bh) {
1689            BUFFER_TRACE(debug_bh, "state when allocated");
1690            BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
1691            brelse(debug_bh);
1692        }
1693    }
1694    jbd_lock_bh_state(bitmap_bh);
1695    spin_lock(sb_bgl_lock(sbi, group_no));
1696    if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
1697        int i;
1698
1699        for (i = 0; i < num; i++) {
1700            if (ext3_test_bit(grp_alloc_blk+i,
1701                    bh2jh(bitmap_bh)->b_committed_data)) {
1702                printk("%s: block was unexpectedly set in "
1703                    "b_committed_data\n", __func__);
1704            }
1705        }
1706    }
1707    ext3_debug("found bit %d\n", grp_alloc_blk);
1708    spin_unlock(sb_bgl_lock(sbi, group_no));
1709    jbd_unlock_bh_state(bitmap_bh);
1710#endif
1711
1712    if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
1713        ext3_error(sb, "ext3_new_block",
1714                "block("E3FSBLK") >= blocks count(%d) - "
1715                "block_group = %d, es == %p ", ret_block,
1716            le32_to_cpu(es->s_blocks_count), group_no, es);
1717        goto out;
1718    }
1719
1720    /*
1721     * It is up to the caller to add the new buffer to a journal
1722     * list of some description. We don't know in advance whether
1723     * the caller wants to use it as metadata or data.
1724     */
1725    ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
1726            ret_block, goal_hits, goal_attempts);
1727
1728    spin_lock(sb_bgl_lock(sbi, group_no));
1729    le16_add_cpu(&gdp->bg_free_blocks_count, -num);
1730    spin_unlock(sb_bgl_lock(sbi, group_no));
1731    percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1732
1733    BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1734    err = ext3_journal_dirty_metadata(handle, gdp_bh);
1735    if (!fatal)
1736        fatal = err;
1737
1738    if (fatal)
1739        goto out;
1740
1741    *errp = 0;
1742    brelse(bitmap_bh);
1743    dquot_free_block(inode, *count-num);
1744    *count = num;
1745    return ret_block;
1746
1747io_error:
1748    *errp = -EIO;
1749out:
1750    if (fatal) {
1751        *errp = fatal;
1752        ext3_std_error(sb, fatal);
1753    }
1754    /*
1755     * Undo the block allocation
1756     */
1757    if (!performed_allocation)
1758        dquot_free_block(inode, *count);
1759    brelse(bitmap_bh);
1760    return 0;
1761}
1762
1763ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
1764            ext3_fsblk_t goal, int *errp)
1765{
1766    unsigned long count = 1;
1767
1768    return ext3_new_blocks(handle, inode, goal, &count, errp);
1769}
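/*
 * Editor's note: a hedged sketch of a multi-block caller (not in the
 * original file). On success the first allocated block number is returned
 * and *count is updated to the number of blocks actually allocated, which
 * may be fewer than requested; on failure 0 is returned and *errp is set:
 *
 *    unsigned long count = 8;    // ask for up to 8 contiguous blocks
 *    int err;
 *    ext3_fsblk_t block;
 *
 *    block = ext3_new_blocks(handle, inode, goal, &count, &err);
 *    if (!block)
 *        return err;             // e.g. -ENOSPC or -EIO
 *    // blocks [block, block + count - 1] now belong to this inode
 */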
1770
1771/**
1772 * ext3_count_free_blocks() -- count filesystem free blocks
1773 * @sb: superblock
1774 *
1775 * Adds up the number of free blocks from each block group.
1776 */
1777ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
1778{
1779    ext3_fsblk_t desc_count;
1780    struct ext3_group_desc *gdp;
1781    int i;
1782    unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
1783#ifdef EXT3FS_DEBUG
1784    struct ext3_super_block *es;
1785    ext3_fsblk_t bitmap_count;
1786    unsigned long x;
1787    struct buffer_head *bitmap_bh = NULL;
1788
1789    es = EXT3_SB(sb)->s_es;
1790    desc_count = 0;
1791    bitmap_count = 0;
1792    gdp = NULL;
1793
1794    smp_rmb();
1795    for (i = 0; i < ngroups; i++) {
1796        gdp = ext3_get_group_desc(sb, i, NULL);
1797        if (!gdp)
1798            continue;
1799        desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1800        brelse(bitmap_bh);
1801        bitmap_bh = read_block_bitmap(sb, i);
1802        if (bitmap_bh == NULL)
1803            continue;
1804
1805        x = ext3_count_free(bitmap_bh, sb->s_blocksize);
1806        printk("group %d: stored = %d, counted = %lu\n",
1807            i, le16_to_cpu(gdp->bg_free_blocks_count), x);
1808        bitmap_count += x;
1809    }
1810    brelse(bitmap_bh);
1811    printk("ext3_count_free_blocks: stored = "E3FSBLK
1812        ", computed = "E3FSBLK", "E3FSBLK"\n",
1813           le32_to_cpu(es->s_free_blocks_count),
1814        desc_count, bitmap_count);
1815    return bitmap_count;
1816#else
1817    desc_count = 0;
1818    smp_rmb();
1819    for (i = 0; i < ngroups; i++) {
1820        gdp = ext3_get_group_desc(sb, i, NULL);
1821        if (!gdp)
1822            continue;
1823        desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
1824    }
1825
1826    return desc_count;
1827#endif
1828}
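/*
 * Editor's note: outside the kernel, the per-group "stored" counts summed
 * here can be cross-checked with e2fsprogs (illustrative invocation):
 *
 *    dumpe2fs -h /dev/sdXN | grep -i 'free blocks'
 */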
1829
1830static inline int test_root(int a, int b)
1831{
1832    int num = b;
1833
1834    while (a > num)
1835        num *= b;
1836    return num == a;
1837}
1838
1839static int ext3_group_sparse(int group)
1840{
1841    if (group <= 1)
1842        return 1;
1843    if (!(group & 1))
1844        return 0;
1845    return (test_root(group, 7) || test_root(group, 5) ||
1846        test_root(group, 3));
1847}
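/*
 * Editor's note: a worked example of the sparse_super placement computed by
 * test_root()/ext3_group_sparse(). Groups holding superblock/GDT backups
 * are group 0, group 1, and the powers of 3, 5 and 7:
 *
 *    test_root(25, 5): num = 5, then 25; 25 == 25, so group 25 is "sparse"
 *    backup groups: 0, 1, 3, 5, 7, 9, 25, 27, 49, 81, 125, 243, ...
 */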
1848
1849/**
1850 * ext3_bg_has_super - number of blocks used by the superblock in group
1851 * @sb: superblock for filesystem
1852 * @group: group number to check
1853 *
1854 * Return the number of blocks used by the superblock (primary or backup)
1855 * in this group. Currently this will be only 0 or 1.
1856 */
1857int ext3_bg_has_super(struct super_block *sb, int group)
1858{
1859    if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
1860                EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
1861            !ext3_group_sparse(group))
1862        return 0;
1863    return 1;
1864}
1865
1866static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
1867{
1868    unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1869    unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
1870    unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;
1871
1872    if (group == first || group == first + 1 || group == last)
1873        return 1;
1874    return 0;
1875}
1876
1877static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
1878{
1879    return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
1880}
1881
1882/**
1883 * ext3_bg_num_gdb - number of blocks used by the group table in group
1884 * @sb: superblock for filesystem
1885 * @group: group number to check
1886 *
1887 * Return the number of blocks used by the group descriptor table
1888 * (primary or backup) in this group. In the future there may be a
1889 * different number of descriptor blocks in each group.
1890 */
1891unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
1892{
1893    unsigned long first_meta_bg =
1894            le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
1895    unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
1896
1897    if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
1898            metagroup < first_meta_bg)
1899        return ext3_bg_num_gdb_nometa(sb, group);
1900
1901    return ext3_bg_num_gdb_meta(sb, group);
1902
1903}
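/*
 * Editor's note: a worked example for the non-META_BG case (assumed 4 KiB
 * block size; struct ext3_group_desc is 32 bytes). EXT3_DESC_PER_BLOCK(sb)
 * is 4096 / 32 = 128, so a filesystem with 1000 block groups needs
 * s_gdb_count = DIV_ROUND_UP(1000, 128) = 8 descriptor blocks, and each
 * group for which ext3_bg_has_super() returns 1 carries all 8 GDT blocks
 * right after its superblock copy.
 */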
1904
1905/**
1906 * ext3_trim_all_free -- function to trim all free space in alloc. group
1907 * @sb: super block for file system
1908 * @group: allocation group to trim
1909 * @start: first group block to examine
1910 * @max: last group block to examine
1912 * @minblocks: minimum extent block count
1913 *
1914 * ext3_trim_all_free walks through the group's block bitmap searching for free
1915 * blocks. When a free block is found, it claims this block and the consecutive
1916 * free blocks after it, building the biggest free extent possible, until it
1917 * reaches a used block. It then issues a TRIM command on this extent, frees the
1918 * extent in the block bitmap, and continues until the whole group is scanned.
1919 */
1920ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1921                ext3_grpblk_t start, ext3_grpblk_t max,
1922                ext3_grpblk_t minblocks)
1923{
1924    handle_t *handle;
1925    ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
1926    ext3_fsblk_t discard_block;
1927    struct ext3_sb_info *sbi;
1928    struct buffer_head *gdp_bh, *bitmap_bh = NULL;
1929    struct ext3_group_desc *gdp;
1930    int err = 0, ret = 0;
1931
1932    /*
1933     * We will update one block bitmap, and one group descriptor
1934     */
1935    handle = ext3_journal_start_sb(sb, 2);
1936    if (IS_ERR(handle))
1937        return PTR_ERR(handle);
1938
1939    bitmap_bh = read_block_bitmap(sb, group);
1940    if (!bitmap_bh) {
1941        err = -EIO;
1942        goto err_out;
1943    }
1944
1945    BUFFER_TRACE(bitmap_bh, "getting undo access");
1946    err = ext3_journal_get_undo_access(handle, bitmap_bh);
1947    if (err)
1948        goto err_out;
1949
1950    gdp = ext3_get_group_desc(sb, group, &gdp_bh);
1951    if (!gdp) {
1952        err = -EIO;
1953        goto err_out;
1954    }
1955
1956    BUFFER_TRACE(gdp_bh, "get_write_access");
1957    err = ext3_journal_get_write_access(handle, gdp_bh);
1958    if (err)
1959        goto err_out;
1960
1961    free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
1962    sbi = EXT3_SB(sb);
1963
1964     /* Walk through the whole group */
1965    while (start < max) {
1966        start = bitmap_search_next_usable_block(start, bitmap_bh, max);
1967        if (start < 0)
1968            break;
1969        next = start;
1970
1971        /*
1972         * Allocate contiguous free extents by setting bits in the
1973         * block bitmap
1974         */
1975        while (next < max
1976            && claim_block(sb_bgl_lock(sbi, group),
1977                    next, bitmap_bh)) {
1978            next++;
1979        }
1980
1981         /* We did not claim any blocks */
1982        if (next == start)
1983            continue;
1984
1985        discard_block = (ext3_fsblk_t)start +
1986                ext3_group_first_block_no(sb, group);
1987
1988        /* Update counters */
1989        spin_lock(sb_bgl_lock(sbi, group));
1990        le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
1991        spin_unlock(sb_bgl_lock(sbi, group));
1992        percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993
1994        free_blocks -= next - start;
1995        /* Do not issue a TRIM on extents smaller than minblocks */
1996        if ((next - start) < minblocks)
1997            goto free_extent;
1998
1999         /* Send the TRIM command down to the device */
2000        err = sb_issue_discard(sb, discard_block, next - start,
2001                       GFP_NOFS, 0);
2002        count += (next - start);
2003free_extent:
2004        freed = 0;
2005
2006        /*
2007         * Clear bits in the bitmap
2008         */
2009        for (bit = start; bit < next; bit++) {
2010            BUFFER_TRACE(bitmap_bh, "clear bit");
2011            if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
2012                        bit, bitmap_bh->b_data)) {
2013                ext3_error(sb, __func__,
2014                    "bit already cleared for block "E3FSBLK,
2015                     (unsigned long)bit);
2016                BUFFER_TRACE(bitmap_bh, "bit already cleared");
2017            } else {
2018                freed++;
2019            }
2020        }
2021
2022        /* Update counters */
2023        spin_lock(sb_bgl_lock(sbi, group));
2024        le16_add_cpu(&gdp->bg_free_blocks_count, freed);
2025        spin_unlock(sb_bgl_lock(sbi, group));
2026        percpu_counter_add(&sbi->s_freeblocks_counter, freed);
2027
2028        start = next;
2029        if (err < 0) {
2030            if (err != -EOPNOTSUPP)
2031                ext3_warning(sb, __func__, "Discard command "
2032                         "returned error %d\n", err);
2033            break;
2034        }
2035
2036        if (fatal_signal_pending(current)) {
2037            err = -ERESTARTSYS;
2038            break;
2039        }
2040
2041        cond_resched();
2042
2043        /* No more suitable extents */
2044        if (free_blocks < minblocks)
2045            break;
2046    }
2047
2048    /* We dirtied the bitmap block */
2049    BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2050    ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
2051    if (!err)
2052        err = ret;
2053
2054    /* And the group descriptor block */
2055    BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
2056    ret = ext3_journal_dirty_metadata(handle, gdp_bh);
2057    if (!err)
2058        err = ret;
2059
2060    ext3_debug("trimmed %d blocks in the group %d\n",
2061        count, group);
2062
2063err_out:
2064    if (err)
2065        count = err;
2066    ext3_journal_stop(handle);
2067    brelse(bitmap_bh);
2068
2069    return count;
2070}
2071
2072/**
2073 * ext3_trim_fs() -- trim ioctl handler function
2074 * @sb: superblock for filesystem
2075 * @range: fstrim_range structure describing the request: the first byte
2076 * to trim, the number of bytes to trim from start, and the minimum extent
2077 * length in bytes; on return range->len holds the number of bytes trimmed
2078 *
2079 * ext3_trim_fs goes through all allocation groups containing bytes from
2080 * start to start+len. For each such group, the ext3_trim_all_free function
2081 * is invoked to trim all free space.
2082 */
2083int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2084{
2085    ext3_grpblk_t last_block, first_block, free_blocks;
2086    unsigned long first_group, last_group;
2087    unsigned long group, ngroups;
2088    struct ext3_group_desc *gdp;
2089    struct ext3_super_block *es = EXT3_SB(sb)->s_es;
2090    uint64_t start, len, minlen, trimmed;
2091    ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2092    int ret = 0;
2093
2094    start = (range->start >> sb->s_blocksize_bits) +
2095        le32_to_cpu(es->s_first_data_block);
2096    len = range->len >> sb->s_blocksize_bits;
2097    minlen = range->minlen >> sb->s_blocksize_bits;
2098    trimmed = 0;
2099
2100    if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
2101        return -EINVAL;
2102    if (start >= max_blks)
2103        goto out;
2104    if (start + len > max_blks)
2105        len = max_blks - start;
2106
2107    ngroups = EXT3_SB(sb)->s_groups_count;
2108    smp_rmb();
2109
2110    /* Determine first and last group to examine based on start and len */
2111    ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
2112                     &first_group, &first_block);
2113    ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
2114                     &last_group, &last_block);
2115    last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
2116    last_block = EXT3_BLOCKS_PER_GROUP(sb);
2117
2118    if (first_group > last_group)
2119        return -EINVAL;
2120
2121    for (group = first_group; group <= last_group; group++) {
2122        gdp = ext3_get_group_desc(sb, group, NULL);
2123        if (!gdp)
2124            break;
2125
2126        free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
2127        if (free_blocks < minlen)
2128            continue;
2129
2130        /*
2131         * For all the groups except the last one, last block will
2132         * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133         * change it for the last group in which case first_block +
2134         * len < EXT3_BLOCKS_PER_GROUP(sb).
2135         */
2136        if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2137            last_block = first_block + len;
2138        len -= last_block - first_block;
2139
2140        ret = ext3_trim_all_free(sb, group, first_block,
2141                    last_block, minlen);
2142        if (ret < 0)
2143            break;
2144
2145        trimmed += ret;
2146        first_block = 0;
2147    }
2148
2149    if (ret >= 0)
2150        ret = 0;
2151
2152out:
2153    range->len = trimmed * sb->s_blocksize;
2154
2155    return ret;
2156}
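/*
 * Editor's note: ext3_trim_fs() is reached from userspace through the
 * FITRIM ioctl. A minimal (hedged) userspace sketch:
 *
 *    #include <fcntl.h>
 *    #include <limits.h>
 *    #include <stdio.h>
 *    #include <sys/ioctl.h>
 *    #include <linux/fs.h>    // FITRIM, struct fstrim_range
 *
 *    struct fstrim_range range = {
 *        .start  = 0,
 *        .len    = ULLONG_MAX,    // whole filesystem
 *        .minlen = 4096,          // skip extents smaller than 4 KiB
 *    };
 *    int fd = open("/mnt", O_RDONLY);
 *
 *    if (fd >= 0 && ioctl(fd, FITRIM, &range) == 0)
 *        printf("%llu bytes trimmed\n", (unsigned long long)range.len);
 */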
2157
