Root/fs/ocfs2/suballoc.c

1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * suballoc.c
5 *
6 * metadata alloc and free
7 * Inspired by ext3 block groups.
8 *
9 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public
22 * License along with this program; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 021110-1307, USA.
25 */
26
27#include <linux/fs.h>
28#include <linux/types.h>
29#include <linux/slab.h>
30#include <linux/highmem.h>
31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h>
34
35#include "ocfs2.h"
36
37#include "alloc.h"
38#include "blockcheck.h"
39#include "dlmglue.h"
40#include "inode.h"
41#include "journal.h"
42#include "localalloc.h"
43#include "suballoc.h"
44#include "super.h"
45#include "sysfile.h"
46#include "uptodate.h"
47
48#include "buffer_head_io.h"
49
50#define NOT_ALLOC_NEW_GROUP 0
51#define ALLOC_NEW_GROUP 0x1
52#define ALLOC_GROUPS_FROM_GLOBAL 0x2
53
54#define OCFS2_MAX_TO_STEAL 1024
55
56static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
57static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
58static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
59static int ocfs2_block_group_fill(handle_t *handle,
60                  struct inode *alloc_inode,
61                  struct buffer_head *bg_bh,
62                  u64 group_blkno,
63                  u16 my_chain,
64                  struct ocfs2_chain_list *cl);
65static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
66                   struct inode *alloc_inode,
67                   struct buffer_head *bh,
68                   u64 max_block,
69                   u64 *last_alloc_group,
70                   int flags);
71
72static int ocfs2_cluster_group_search(struct inode *inode,
73                      struct buffer_head *group_bh,
74                      u32 bits_wanted, u32 min_bits,
75                      u64 max_block,
76                      u16 *bit_off, u16 *bits_found);
77static int ocfs2_block_group_search(struct inode *inode,
78                    struct buffer_head *group_bh,
79                    u32 bits_wanted, u32 min_bits,
80                    u64 max_block,
81                    u16 *bit_off, u16 *bits_found);
82static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
83                     struct ocfs2_alloc_context *ac,
84                     handle_t *handle,
85                     u32 bits_wanted,
86                     u32 min_bits,
87                     u16 *bit_off,
88                     unsigned int *num_bits,
89                     u64 *bg_blkno);
90static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
91                     int nr);
92static inline int ocfs2_block_group_set_bits(handle_t *handle,
93                         struct inode *alloc_inode,
94                         struct ocfs2_group_desc *bg,
95                         struct buffer_head *group_bh,
96                         unsigned int bit_off,
97                         unsigned int num_bits);
98static int ocfs2_relink_block_group(handle_t *handle,
99                    struct inode *alloc_inode,
100                    struct buffer_head *fe_bh,
101                    struct buffer_head *bg_bh,
102                    struct buffer_head *prev_bg_bh,
103                    u16 chain);
104static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
105                             u32 wanted);
106static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
107                           u64 bg_blkno,
108                           u16 bg_bit_off);
109static inline void ocfs2_block_to_cluster_group(struct inode *inode,
110                        u64 data_blkno,
111                        u64 *bg_blkno,
112                        u16 *bg_bit_off);
113static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
114                         u32 bits_wanted, u64 max_block,
115                         int flags,
116                         struct ocfs2_alloc_context **ac);
117
118void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
119{
120    struct inode *inode = ac->ac_inode;
121
122    if (inode) {
123        if (ac->ac_which != OCFS2_AC_USE_LOCAL)
124            ocfs2_inode_unlock(inode, 1);
125
126        mutex_unlock(&inode->i_mutex);
127
128        iput(inode);
129        ac->ac_inode = NULL;
130    }
131    brelse(ac->ac_bh);
132    ac->ac_bh = NULL;
133}
134
/*
 * Tear down an allocation context completely: release whatever it
 * holds via ocfs2_free_ac_resource(), then free the context memory.
 * @ac must not be used after this call.
 */
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
{
    /* Resources first; they are reached through @ac. */
    ocfs2_free_ac_resource(ac);

    kfree(ac);
}
140
141static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142{
143    return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144}
145
/*
 * Report a group descriptor validation failure.
 *
 * Expands in a scope that provides `resize` and `sb`.  When `resize`
 * is non-zero the problem is only logged through mlog(ML_ERROR);
 * otherwise it is handed to ocfs2_error().
 */
#define do_error(fmt, ...)                                      \
    do {                                                        \
        if (!resize)                                            \
            ocfs2_error(sb, fmt, ##__VA_ARGS__);                \
        else                                                    \
            mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);            \
    } while (0)
153
154static int ocfs2_validate_gd_self(struct super_block *sb,
155                  struct buffer_head *bh,
156                  int resize)
157{
158    struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
159
160    if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
161        do_error("Group descriptor #%llu has bad signature %.*s",
162             (unsigned long long)bh->b_blocknr, 7,
163             gd->bg_signature);
164        return -EINVAL;
165    }
166
167    if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
168        do_error("Group descriptor #%llu has an invalid bg_blkno "
169             "of %llu",
170             (unsigned long long)bh->b_blocknr,
171             (unsigned long long)le64_to_cpu(gd->bg_blkno));
172        return -EINVAL;
173    }
174
175    if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
176        do_error("Group descriptor #%llu has an invalid "
177             "fs_generation of #%u",
178             (unsigned long long)bh->b_blocknr,
179             le32_to_cpu(gd->bg_generation));
180        return -EINVAL;
181    }
182
183    if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
184        do_error("Group descriptor #%llu has bit count %u but "
185             "claims that %u are free",
186             (unsigned long long)bh->b_blocknr,
187             le16_to_cpu(gd->bg_bits),
188             le16_to_cpu(gd->bg_free_bits_count));
189        return -EINVAL;
190    }
191
192    if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
193        do_error("Group descriptor #%llu has bit count %u but "
194             "max bitmap bits of %u",
195             (unsigned long long)bh->b_blocknr,
196             le16_to_cpu(gd->bg_bits),
197             8 * le16_to_cpu(gd->bg_size));
198        return -EINVAL;
199    }
200
201    return 0;
202}
203
204static int ocfs2_validate_gd_parent(struct super_block *sb,
205                    struct ocfs2_dinode *di,
206                    struct buffer_head *bh,
207                    int resize)
208{
209    unsigned int max_bits;
210    struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
211
212    if (di->i_blkno != gd->bg_parent_dinode) {
213        do_error("Group descriptor #%llu has bad parent "
214             "pointer (%llu, expected %llu)",
215             (unsigned long long)bh->b_blocknr,
216             (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
217             (unsigned long long)le64_to_cpu(di->i_blkno));
218        return -EINVAL;
219    }
220
221    max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
222    if (le16_to_cpu(gd->bg_bits) > max_bits) {
223        do_error("Group descriptor #%llu has bit count of %u",
224             (unsigned long long)bh->b_blocknr,
225             le16_to_cpu(gd->bg_bits));
226        return -EINVAL;
227    }
228
229    /* In resize, we may meet the case bg_chain == cl_next_free_rec. */
230    if ((le16_to_cpu(gd->bg_chain) >
231         le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
232        ((le16_to_cpu(gd->bg_chain) ==
233         le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
234        do_error("Group descriptor #%llu has bad chain %u",
235             (unsigned long long)bh->b_blocknr,
236             le16_to_cpu(gd->bg_chain));
237        return -EINVAL;
238    }
239
240    return 0;
241}
242
243#undef do_error
244
245/*
246 * This version only prints errors. It does not fail the filesystem, and
247 * exists only for resize.
248 */
249int ocfs2_check_group_descriptor(struct super_block *sb,
250                 struct ocfs2_dinode *di,
251                 struct buffer_head *bh)
252{
253    int rc;
254    struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255
256    BUG_ON(!buffer_uptodate(bh));
257
258    /*
259     * If the ecc fails, we return the error but otherwise
260     * leave the filesystem running. We know any error is
261     * local to this block.
262     */
263    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264    if (rc) {
265        mlog(ML_ERROR,
266             "Checksum failed for group descriptor %llu\n",
267             (unsigned long long)bh->b_blocknr);
268    } else
269        rc = ocfs2_validate_gd_self(sb, bh, 1);
270    if (!rc)
271        rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
272
273    return rc;
274}
275
276static int ocfs2_validate_group_descriptor(struct super_block *sb,
277                       struct buffer_head *bh)
278{
279    int rc;
280    struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
281
282    mlog(0, "Validating group descriptor %llu\n",
283         (unsigned long long)bh->b_blocknr);
284
285    BUG_ON(!buffer_uptodate(bh));
286
287    /*
288     * If the ecc fails, we return the error but otherwise
289     * leave the filesystem running. We know any error is
290     * local to this block.
291     */
292    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293    if (rc)
294        return rc;
295
296    /*
297     * Errors after here are fatal.
298     */
299
300    return ocfs2_validate_gd_self(sb, bh, 0);
301}
302
303int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
304                u64 gd_blkno, struct buffer_head **bh)
305{
306    int rc;
307    struct buffer_head *tmp = *bh;
308
309    rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
310                  ocfs2_validate_group_descriptor);
311    if (rc)
312        goto out;
313
314    rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
315    if (rc) {
316        brelse(tmp);
317        goto out;
318    }
319
320    /* If ocfs2_read_block() got us a new bh, pass it up. */
321    if (!*bh)
322        *bh = tmp;
323
324out:
325    return rc;
326}
327
328static int ocfs2_block_group_fill(handle_t *handle,
329                  struct inode *alloc_inode,
330                  struct buffer_head *bg_bh,
331                  u64 group_blkno,
332                  u16 my_chain,
333                  struct ocfs2_chain_list *cl)
334{
335    int status = 0;
336    struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337    struct super_block * sb = alloc_inode->i_sb;
338
339    mlog_entry_void();
340
341    if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
342        ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
343                "b_blocknr (%llu)",
344                (unsigned long long)group_blkno,
345                (unsigned long long) bg_bh->b_blocknr);
346        status = -EIO;
347        goto bail;
348    }
349
350    status = ocfs2_journal_access_gd(handle,
351                     INODE_CACHE(alloc_inode),
352                     bg_bh,
353                     OCFS2_JOURNAL_ACCESS_CREATE);
354    if (status < 0) {
355        mlog_errno(status);
356        goto bail;
357    }
358
359    memset(bg, 0, sb->s_blocksize);
360    strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361    bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362    bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
363    bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
364    bg->bg_chain = cpu_to_le16(my_chain);
365    bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366    bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367    bg->bg_blkno = cpu_to_le64(group_blkno);
368    /* set the 1st bit in the bitmap to account for the descriptor block */
369    ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370    bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371
372    status = ocfs2_journal_dirty(handle, bg_bh);
373    if (status < 0)
374        mlog_errno(status);
375
376    /* There is no need to zero out or otherwise initialize the
377     * other blocks in a group - All valid FS metadata in a block
378     * group stores the superblock fs_generation value at
379     * allocation time. */
380
381bail:
382    mlog_exit(status);
383    return status;
384}
385
386static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
387{
388    u16 curr, best;
389
390    best = curr = 0;
391    while (curr < le16_to_cpu(cl->cl_count)) {
392        if (le32_to_cpu(cl->cl_recs[best].c_total) >
393            le32_to_cpu(cl->cl_recs[curr].c_total))
394            best = curr;
395        curr++;
396    }
397    return best;
398}
399
400/*
401 * We expect the block group allocator to already be locked.
402 */
403static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404                   struct inode *alloc_inode,
405                   struct buffer_head *bh,
406                   u64 max_block,
407                   u64 *last_alloc_group,
408                   int flags)
409{
410    int status, credits;
411    struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
412    struct ocfs2_chain_list *cl;
413    struct ocfs2_alloc_context *ac = NULL;
414    handle_t *handle = NULL;
415    u32 bit_off, num_bits;
416    u16 alloc_rec;
417    u64 bg_blkno;
418    struct buffer_head *bg_bh = NULL;
419    struct ocfs2_group_desc *bg;
420
421    BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
422
423    mlog_entry_void();
424
425    cl = &fe->id2.i_chain;
426    status = ocfs2_reserve_clusters_with_limit(osb,
427                           le16_to_cpu(cl->cl_cpg),
428                           max_block, flags, &ac);
429    if (status < 0) {
430        if (status != -ENOSPC)
431            mlog_errno(status);
432        goto bail;
433    }
434
435    credits = ocfs2_calc_group_alloc_credits(osb->sb,
436                         le16_to_cpu(cl->cl_cpg));
437    handle = ocfs2_start_trans(osb, credits);
438    if (IS_ERR(handle)) {
439        status = PTR_ERR(handle);
440        handle = NULL;
441        mlog_errno(status);
442        goto bail;
443    }
444
445    if (last_alloc_group && *last_alloc_group != 0) {
446        mlog(0, "use old allocation group %llu for block group alloc\n",
447             (unsigned long long)*last_alloc_group);
448        ac->ac_last_group = *last_alloc_group;
449    }
450    status = ocfs2_claim_clusters(osb,
451                      handle,
452                      ac,
453                      le16_to_cpu(cl->cl_cpg),
454                      &bit_off,
455                      &num_bits);
456    if (status < 0) {
457        if (status != -ENOSPC)
458            mlog_errno(status);
459        goto bail;
460    }
461
462    alloc_rec = ocfs2_find_smallest_chain(cl);
463
464    /* setup the group */
465    bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
466    mlog(0, "new descriptor, record %u, at block %llu\n",
467         alloc_rec, (unsigned long long)bg_blkno);
468
469    bg_bh = sb_getblk(osb->sb, bg_blkno);
470    if (!bg_bh) {
471        status = -EIO;
472        mlog_errno(status);
473        goto bail;
474    }
475    ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
476
477    status = ocfs2_block_group_fill(handle,
478                    alloc_inode,
479                    bg_bh,
480                    bg_blkno,
481                    alloc_rec,
482                    cl);
483    if (status < 0) {
484        mlog_errno(status);
485        goto bail;
486    }
487
488    bg = (struct ocfs2_group_desc *) bg_bh->b_data;
489
490    status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
491                     bh, OCFS2_JOURNAL_ACCESS_WRITE);
492    if (status < 0) {
493        mlog_errno(status);
494        goto bail;
495    }
496
497    le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
498             le16_to_cpu(bg->bg_free_bits_count));
499    le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
500    cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno);
501    if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
502        le16_add_cpu(&cl->cl_next_free_rec, 1);
503
504    le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
505                    le16_to_cpu(bg->bg_free_bits_count));
506    le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
507    le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
508
509    status = ocfs2_journal_dirty(handle, bh);
510    if (status < 0) {
511        mlog_errno(status);
512        goto bail;
513    }
514
515    spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
516    OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
517    fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
518                         le32_to_cpu(fe->i_clusters)));
519    spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
520    i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
521    alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
522
523    status = 0;
524
525    /* save the new last alloc group so that the caller can cache it. */
526    if (last_alloc_group)
527        *last_alloc_group = ac->ac_last_group;
528
529bail:
530    if (handle)
531        ocfs2_commit_trans(osb, handle);
532
533    if (ac)
534        ocfs2_free_alloc_context(ac);
535
536    brelse(bg_bh);
537
538    mlog_exit(status);
539    return status;
540}
541
542static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
543                       struct ocfs2_alloc_context *ac,
544                       int type,
545                       u32 slot,
546                       u64 *last_alloc_group,
547                       int flags)
548{
549    int status;
550    u32 bits_wanted = ac->ac_bits_wanted;
551    struct inode *alloc_inode;
552    struct buffer_head *bh = NULL;
553    struct ocfs2_dinode *fe;
554    u32 free_bits;
555
556    mlog_entry_void();
557
558    alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
559    if (!alloc_inode) {
560        mlog_errno(-EINVAL);
561        return -EINVAL;
562    }
563
564    mutex_lock(&alloc_inode->i_mutex);
565
566    status = ocfs2_inode_lock(alloc_inode, &bh, 1);
567    if (status < 0) {
568        mutex_unlock(&alloc_inode->i_mutex);
569        iput(alloc_inode);
570
571        mlog_errno(status);
572        return status;
573    }
574
575    ac->ac_inode = alloc_inode;
576    ac->ac_alloc_slot = slot;
577
578    fe = (struct ocfs2_dinode *) bh->b_data;
579
580    /* The bh was validated by the inode read inside
581     * ocfs2_inode_lock(). Any corruption is a code bug. */
582    BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
583
584    if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
585        ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
586                (unsigned long long)le64_to_cpu(fe->i_blkno));
587        status = -EIO;
588        goto bail;
589    }
590
591    free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
592        le32_to_cpu(fe->id1.bitmap1.i_used);
593
594    if (bits_wanted > free_bits) {
595        /* cluster bitmap never grows */
596        if (ocfs2_is_cluster_bitmap(alloc_inode)) {
597            mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
598                 bits_wanted, free_bits);
599            status = -ENOSPC;
600            goto bail;
601        }
602
603        if (!(flags & ALLOC_NEW_GROUP)) {
604            mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
605                 "and we don't alloc a new group for it.\n",
606                 slot, bits_wanted, free_bits);
607            status = -ENOSPC;
608            goto bail;
609        }
610
611        status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
612                         ac->ac_max_block,
613                         last_alloc_group, flags);
614        if (status < 0) {
615            if (status != -ENOSPC)
616                mlog_errno(status);
617            goto bail;
618        }
619        atomic_inc(&osb->alloc_stats.bg_extends);
620
621        /* You should never ask for this much metadata */
622        BUG_ON(bits_wanted >
623               (le32_to_cpu(fe->id1.bitmap1.i_total)
624            - le32_to_cpu(fe->id1.bitmap1.i_used)));
625    }
626
627    get_bh(bh);
628    ac->ac_bh = bh;
629bail:
630    brelse(bh);
631
632    mlog_exit(status);
633    return status;
634}
635
636static void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
637{
638    spin_lock(&osb->osb_lock);
639    osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
640    spin_unlock(&osb->osb_lock);
641    atomic_set(&osb->s_num_inodes_stolen, 0);
642}
643
644static void ocfs2_init_meta_steal_slot(struct ocfs2_super *osb)
645{
646    spin_lock(&osb->osb_lock);
647    osb->s_meta_steal_slot = OCFS2_INVALID_SLOT;
648    spin_unlock(&osb->osb_lock);
649    atomic_set(&osb->s_num_meta_stolen, 0);
650}
651
/*
 * Reset both steal slots (inode and metadata) and their counters.
 */
void ocfs2_init_steal_slots(struct ocfs2_super *osb)
{
    /* Inode allocator state first, then the metadata one. */
    ocfs2_init_inode_steal_slot(osb);
    ocfs2_init_meta_steal_slot(osb);
}
657
658static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
659{
660    spin_lock(&osb->osb_lock);
661    if (type == INODE_ALLOC_SYSTEM_INODE)
662        osb->s_inode_steal_slot = slot;
663    else if (type == EXTENT_ALLOC_SYSTEM_INODE)
664        osb->s_meta_steal_slot = slot;
665    spin_unlock(&osb->osb_lock);
666}
667
668static int __ocfs2_get_steal_slot(struct ocfs2_super *osb, int type)
669{
670    int slot = OCFS2_INVALID_SLOT;
671
672    spin_lock(&osb->osb_lock);
673    if (type == INODE_ALLOC_SYSTEM_INODE)
674        slot = osb->s_inode_steal_slot;
675    else if (type == EXTENT_ALLOC_SYSTEM_INODE)
676        slot = osb->s_meta_steal_slot;
677    spin_unlock(&osb->osb_lock);
678
679    return slot;
680}
681
682static int ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
683{
684    return __ocfs2_get_steal_slot(osb, INODE_ALLOC_SYSTEM_INODE);
685}
686
687static int ocfs2_get_meta_steal_slot(struct ocfs2_super *osb)
688{
689    return __ocfs2_get_steal_slot(osb, EXTENT_ALLOC_SYSTEM_INODE);
690}
691
692static int ocfs2_steal_resource(struct ocfs2_super *osb,
693                struct ocfs2_alloc_context *ac,
694                int type)
695{
696    int i, status = -ENOSPC;
697    int slot = __ocfs2_get_steal_slot(osb, type);
698
699    /* Start to steal resource from the first slot after ours. */
700    if (slot == OCFS2_INVALID_SLOT)
701        slot = osb->slot_num + 1;
702
703    for (i = 0; i < osb->max_slots; i++, slot++) {
704        if (slot == osb->max_slots)
705            slot = 0;
706
707        if (slot == osb->slot_num)
708            continue;
709
710        status = ocfs2_reserve_suballoc_bits(osb, ac,
711                             type,
712                             (u32)slot, NULL,
713                             NOT_ALLOC_NEW_GROUP);
714        if (status >= 0) {
715            __ocfs2_set_steal_slot(osb, slot, type);
716            break;
717        }
718
719        ocfs2_free_ac_resource(ac);
720    }
721
722    return status;
723}
724
725static int ocfs2_steal_inode(struct ocfs2_super *osb,
726                 struct ocfs2_alloc_context *ac)
727{
728    return ocfs2_steal_resource(osb, ac, INODE_ALLOC_SYSTEM_INODE);
729}
730
731static int ocfs2_steal_meta(struct ocfs2_super *osb,
732                struct ocfs2_alloc_context *ac)
733{
734    return ocfs2_steal_resource(osb, ac, EXTENT_ALLOC_SYSTEM_INODE);
735}
736
737int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
738                      int blocks,
739                      struct ocfs2_alloc_context **ac)
740{
741    int status;
742    int slot = ocfs2_get_meta_steal_slot(osb);
743
744    *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
745    if (!(*ac)) {
746        status = -ENOMEM;
747        mlog_errno(status);
748        goto bail;
749    }
750
751    (*ac)->ac_bits_wanted = blocks;
752    (*ac)->ac_which = OCFS2_AC_USE_META;
753    (*ac)->ac_group_search = ocfs2_block_group_search;
754
755    if (slot != OCFS2_INVALID_SLOT &&
756        atomic_read(&osb->s_num_meta_stolen) < OCFS2_MAX_TO_STEAL)
757        goto extent_steal;
758
759    atomic_set(&osb->s_num_meta_stolen, 0);
760    status = ocfs2_reserve_suballoc_bits(osb, (*ac),
761                         EXTENT_ALLOC_SYSTEM_INODE,
762                         (u32)osb->slot_num, NULL,
763                         ALLOC_NEW_GROUP);
764
765
766    if (status >= 0) {
767        status = 0;
768        if (slot != OCFS2_INVALID_SLOT)
769            ocfs2_init_meta_steal_slot(osb);
770        goto bail;
771    } else if (status < 0 && status != -ENOSPC) {
772        mlog_errno(status);
773        goto bail;
774    }
775
776    ocfs2_free_ac_resource(*ac);
777
778extent_steal:
779    status = ocfs2_steal_meta(osb, *ac);
780    atomic_inc(&osb->s_num_meta_stolen);
781    if (status < 0) {
782        if (status != -ENOSPC)
783            mlog_errno(status);
784        goto bail;
785    }
786
787    status = 0;
788bail:
789    if ((status < 0) && *ac) {
790        ocfs2_free_alloc_context(*ac);
791        *ac = NULL;
792    }
793
794    mlog_exit(status);
795    return status;
796}
797
/*
 * Reserve as many metadata blocks as ocfs2_extend_meta_needed()
 * reports for @root_el.  See ocfs2_reserve_new_metadata_blocks() for
 * the return value and the handling of @ac.
 */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
                   struct ocfs2_extent_list *root_el,
                   struct ocfs2_alloc_context **ac)
{
    int blocks = ocfs2_extend_meta_needed(root_el);

    return ocfs2_reserve_new_metadata_blocks(osb, blocks, ac);
}
806
807int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
808                struct ocfs2_alloc_context **ac)
809{
810    int status;
811    int slot = ocfs2_get_inode_steal_slot(osb);
812    u64 alloc_group;
813
814    *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
815    if (!(*ac)) {
816        status = -ENOMEM;
817        mlog_errno(status);
818        goto bail;
819    }
820
821    (*ac)->ac_bits_wanted = 1;
822    (*ac)->ac_which = OCFS2_AC_USE_INODE;
823
824    (*ac)->ac_group_search = ocfs2_block_group_search;
825
826    /*
827     * stat(2) can't handle i_ino > 32bits, so we tell the
828     * lower levels not to allocate us a block group past that
829     * limit. The 'inode64' mount option avoids this behavior.
830     */
831    if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
832        (*ac)->ac_max_block = (u32)~0U;
833
834    /*
835     * slot is set when we successfully steal inode from other nodes.
836     * It is reset in 3 places:
837     * 1. when we flush the truncate log
838     * 2. when we complete local alloc recovery.
839     * 3. when we successfully allocate from our own slot.
840     * After it is set, we will go on stealing inodes until we find the
841     * need to check our slots to see whether there is some space for us.
842     */
843    if (slot != OCFS2_INVALID_SLOT &&
844        atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_TO_STEAL)
845        goto inode_steal;
846
847    atomic_set(&osb->s_num_inodes_stolen, 0);
848    alloc_group = osb->osb_inode_alloc_group;
849    status = ocfs2_reserve_suballoc_bits(osb, *ac,
850                         INODE_ALLOC_SYSTEM_INODE,
851                         (u32)osb->slot_num,
852                         &alloc_group,
853                         ALLOC_NEW_GROUP |
854                         ALLOC_GROUPS_FROM_GLOBAL);
855    if (status >= 0) {
856        status = 0;
857
858        spin_lock(&osb->osb_lock);
859        osb->osb_inode_alloc_group = alloc_group;
860        spin_unlock(&osb->osb_lock);
861        mlog(0, "after reservation, new allocation group is "
862             "%llu\n", (unsigned long long)alloc_group);
863
864        /*
865         * Some inodes must be freed by us, so try to allocate
866         * from our own next time.
867         */
868        if (slot != OCFS2_INVALID_SLOT)
869            ocfs2_init_inode_steal_slot(osb);
870        goto bail;
871    } else if (status < 0 && status != -ENOSPC) {
872        mlog_errno(status);
873        goto bail;
874    }
875
876    ocfs2_free_ac_resource(*ac);
877
878inode_steal:
879    status = ocfs2_steal_inode(osb, *ac);
880    atomic_inc(&osb->s_num_inodes_stolen);
881    if (status < 0) {
882        if (status != -ENOSPC)
883            mlog_errno(status);
884        goto bail;
885    }
886
887    status = 0;
888bail:
889    if ((status < 0) && *ac) {
890        ocfs2_free_alloc_context(*ac);
891        *ac = NULL;
892    }
893
894    mlog_exit(status);
895    return status;
896}
897
898/* local alloc code has to do the same thing, so rather than do this
899 * twice.. */
900int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
901                      struct ocfs2_alloc_context *ac)
902{
903    int status;
904
905    ac->ac_which = OCFS2_AC_USE_MAIN;
906    ac->ac_group_search = ocfs2_cluster_group_search;
907
908    status = ocfs2_reserve_suballoc_bits(osb, ac,
909                         GLOBAL_BITMAP_SYSTEM_INODE,
910                         OCFS2_INVALID_SLOT, NULL,
911                         ALLOC_NEW_GROUP);
912    if (status < 0 && status != -ENOSPC) {
913        mlog_errno(status);
914        goto bail;
915    }
916
917bail:
918    return status;
919}
920
921/* Callers don't need to care which bitmap (local alloc or main) to
922 * use so we figure it out for them, but unfortunately this clutters
923 * things a bit. */
924static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
925                         u32 bits_wanted, u64 max_block,
926                         int flags,
927                         struct ocfs2_alloc_context **ac)
928{
929    int status;
930
931    mlog_entry_void();
932
933    *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
934    if (!(*ac)) {
935        status = -ENOMEM;
936        mlog_errno(status);
937        goto bail;
938    }
939
940    (*ac)->ac_bits_wanted = bits_wanted;
941    (*ac)->ac_max_block = max_block;
942
943    status = -ENOSPC;
944    if (!(flags & ALLOC_GROUPS_FROM_GLOBAL) &&
945        ocfs2_alloc_should_use_local(osb, bits_wanted)) {
946        status = ocfs2_reserve_local_alloc_bits(osb,
947                            bits_wanted,
948                            *ac);
949        if (status == -EFBIG) {
950            /* The local alloc window is outside ac_max_block.
951             * use the main bitmap. */
952            status = -ENOSPC;
953        } else if ((status < 0) && (status != -ENOSPC)) {
954            mlog_errno(status);
955            goto bail;
956        }
957    }
958
959    if (status == -ENOSPC) {
960        status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
961        if (status < 0) {
962            if (status != -ENOSPC)
963                mlog_errno(status);
964            goto bail;
965        }
966    }
967
968    status = 0;
969bail:
970    if ((status < 0) && *ac) {
971        ocfs2_free_alloc_context(*ac);
972        *ac = NULL;
973    }
974
975    mlog_exit(status);
976    return status;
977}
978
979int ocfs2_reserve_clusters(struct ocfs2_super *osb,
980               u32 bits_wanted,
981               struct ocfs2_alloc_context **ac)
982{
983    return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0,
984                         ALLOC_NEW_GROUP, ac);
985}
986
987/*
988 * More or less lifted from ext3. I'll leave their description below:
989 *
990 * "For ext3 allocations, we must not reuse any blocks which are
991 * allocated in the bitmap buffer's "last committed data" copy. This
992 * prevents deletes from freeing up the page for reuse until we have
993 * committed the delete transaction.
994 *
995 * If we didn't do this, then deleting something and reallocating it as
996 * data would allow the old block to be overwritten before the
997 * transaction committed (because we force data to disk before commit).
998 * This would lead to corruption if we crashed between overwriting the
999 * data and committing the delete.
1000 *
1001 * @@@ We may want to make this allocation behaviour conditional on
1002 * data-writes at some point, and disable it for metadata allocations or
1003 * sync-data inodes."
1004 *
1005 * Note: OCFS2 already does this differently for metadata vs data
1006 * allocations, as those bitmaps are separate and undo access is never
1007 * called on a metadata group descriptor.
1008 */
1009static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
1010                     int nr)
1011{
1012    struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1013    int ret;
1014
1015    if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
1016        return 0;
1017
1018    if (!buffer_jbd(bg_bh))
1019        return 1;
1020
1021    jbd_lock_bh_state(bg_bh);
1022    bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
1023    if (bg)
1024        ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
1025    else
1026        ret = 1;
1027    jbd_unlock_bh_state(bg_bh);
1028
1029    return ret;
1030}
1031
1032static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
1033                         struct buffer_head *bg_bh,
1034                         unsigned int bits_wanted,
1035                         unsigned int total_bits,
1036                         u16 *bit_off,
1037                         u16 *bits_found)
1038{
1039    void *bitmap;
1040    u16 best_offset, best_size;
1041    int offset, start, found, status = 0;
1042    struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1043
1044    /* Callers got this descriptor from
1045     * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1046    BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1047
1048    found = start = best_offset = best_size = 0;
1049    bitmap = bg->bg_bitmap;
1050
1051    while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
1052        if (offset == total_bits)
1053            break;
1054
1055        if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
1056            /* We found a zero, but we can't use it as it
1057             * hasn't been put to disk yet! */
1058            found = 0;
1059            start = offset + 1;
1060        } else if (offset == start) {
1061            /* we found a zero */
1062            found++;
1063            /* move start to the next bit to test */
1064            start++;
1065        } else {
1066            /* got a zero after some ones */
1067            found = 1;
1068            start = offset + 1;
1069        }
1070        if (found > best_size) {
1071            best_size = found;
1072            best_offset = start - found;
1073        }
1074        /* we got everything we needed */
1075        if (found == bits_wanted) {
1076            /* mlog(0, "Found it all!\n"); */
1077            break;
1078        }
1079    }
1080
1081    /* XXX: I think the first clause is equivalent to the second
1082     * - jlbec */
1083    if (found == bits_wanted) {
1084        *bit_off = start - found;
1085        *bits_found = found;
1086    } else if (best_size) {
1087        *bit_off = best_offset;
1088        *bits_found = best_size;
1089    } else {
1090        status = -ENOSPC;
1091        /* No error log here -- see the comment above
1092         * ocfs2_test_bg_bit_allocatable */
1093    }
1094
1095    return status;
1096}
1097
1098static inline int ocfs2_block_group_set_bits(handle_t *handle,
1099                         struct inode *alloc_inode,
1100                         struct ocfs2_group_desc *bg,
1101                         struct buffer_head *group_bh,
1102                         unsigned int bit_off,
1103                         unsigned int num_bits)
1104{
1105    int status;
1106    void *bitmap = bg->bg_bitmap;
1107    int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1108
1109    mlog_entry_void();
1110
1111    /* All callers get the descriptor via
1112     * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1113    BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1114    BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1115
1116    mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
1117         num_bits);
1118
1119    if (ocfs2_is_cluster_bitmap(alloc_inode))
1120        journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1121
1122    status = ocfs2_journal_access_gd(handle,
1123                     INODE_CACHE(alloc_inode),
1124                     group_bh,
1125                     journal_type);
1126    if (status < 0) {
1127        mlog_errno(status);
1128        goto bail;
1129    }
1130
1131    le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1132
1133    while(num_bits--)
1134        ocfs2_set_bit(bit_off++, bitmap);
1135
1136    status = ocfs2_journal_dirty(handle,
1137                     group_bh);
1138    if (status < 0) {
1139        mlog_errno(status);
1140        goto bail;
1141    }
1142
1143bail:
1144    mlog_exit(status);
1145    return status;
1146}
1147
1148/* find the one with the most empty bits */
1149static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1150{
1151    u16 curr, best;
1152
1153    BUG_ON(!cl->cl_next_free_rec);
1154
1155    best = curr = 0;
1156    while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1157        if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1158            le32_to_cpu(cl->cl_recs[best].c_free))
1159            best = curr;
1160        curr++;
1161    }
1162
1163    BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1164    return best;
1165}
1166
1167static int ocfs2_relink_block_group(handle_t *handle,
1168                    struct inode *alloc_inode,
1169                    struct buffer_head *fe_bh,
1170                    struct buffer_head *bg_bh,
1171                    struct buffer_head *prev_bg_bh,
1172                    u16 chain)
1173{
1174    int status;
1175    /* there is a really tiny chance the journal calls could fail,
1176     * but we wouldn't want inconsistent blocks in *any* case. */
1177    u64 fe_ptr, bg_ptr, prev_bg_ptr;
1178    struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1179    struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1180    struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1181
1182    /* The caller got these descriptors from
1183     * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1184    BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1185    BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1186
1187    mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1188         (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1189         (unsigned long long)le64_to_cpu(bg->bg_blkno),
1190         (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1191
1192    fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1193    bg_ptr = le64_to_cpu(bg->bg_next_group);
1194    prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1195
1196    status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1197                     prev_bg_bh,
1198                     OCFS2_JOURNAL_ACCESS_WRITE);
1199    if (status < 0) {
1200        mlog_errno(status);
1201        goto out_rollback;
1202    }
1203
1204    prev_bg->bg_next_group = bg->bg_next_group;
1205
1206    status = ocfs2_journal_dirty(handle, prev_bg_bh);
1207    if (status < 0) {
1208        mlog_errno(status);
1209        goto out_rollback;
1210    }
1211
1212    status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1213                     bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1214    if (status < 0) {
1215        mlog_errno(status);
1216        goto out_rollback;
1217    }
1218
1219    bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1220
1221    status = ocfs2_journal_dirty(handle, bg_bh);
1222    if (status < 0) {
1223        mlog_errno(status);
1224        goto out_rollback;
1225    }
1226
1227    status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
1228                     fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
1229    if (status < 0) {
1230        mlog_errno(status);
1231        goto out_rollback;
1232    }
1233
1234    fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1235
1236    status = ocfs2_journal_dirty(handle, fe_bh);
1237    if (status < 0) {
1238        mlog_errno(status);
1239        goto out_rollback;
1240    }
1241
1242    status = 0;
1243out_rollback:
1244    if (status < 0) {
1245        fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1246        bg->bg_next_group = cpu_to_le64(bg_ptr);
1247        prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1248    }
1249
1250    mlog_exit(status);
1251    return status;
1252}
1253
1254static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1255                             u32 wanted)
1256{
1257    return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1258}
1259
1260/* return 0 on success, -ENOSPC to keep searching and any other < 0
1261 * value on error. */
1262static int ocfs2_cluster_group_search(struct inode *inode,
1263                      struct buffer_head *group_bh,
1264                      u32 bits_wanted, u32 min_bits,
1265                      u64 max_block,
1266                      u16 *bit_off, u16 *bits_found)
1267{
1268    int search = -ENOSPC;
1269    int ret;
1270    u64 blkoff;
1271    struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1272    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1273    u16 tmp_off, tmp_found;
1274    unsigned int max_bits, gd_cluster_off;
1275
1276    BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1277
1278    if (gd->bg_free_bits_count) {
1279        max_bits = le16_to_cpu(gd->bg_bits);
1280
1281        /* Tail groups in cluster bitmaps which aren't cpg
1282         * aligned are prone to partial extention by a failed
1283         * fs resize. If the file system resize never got to
1284         * update the dinode cluster count, then we don't want
1285         * to trust any clusters past it, regardless of what
1286         * the group descriptor says. */
1287        gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1288                              le64_to_cpu(gd->bg_blkno));
1289        if ((gd_cluster_off + max_bits) >
1290            OCFS2_I(inode)->ip_clusters) {
1291            max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1292            mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1293                 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1294                 le16_to_cpu(gd->bg_bits),
1295                 OCFS2_I(inode)->ip_clusters, max_bits);
1296        }
1297
1298        ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1299                            group_bh, bits_wanted,
1300                            max_bits,
1301                            &tmp_off, &tmp_found);
1302        if (ret)
1303            return ret;
1304
1305        if (max_block) {
1306            blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1307                              gd_cluster_off +
1308                              tmp_off + tmp_found);
1309            mlog(0, "Checking %llu against %llu\n",
1310                 (unsigned long long)blkoff,
1311                 (unsigned long long)max_block);
1312            if (blkoff > max_block)
1313                return -ENOSPC;
1314        }
1315
1316        /* ocfs2_block_group_find_clear_bits() might
1317         * return success, but we still want to return
1318         * -ENOSPC unless it found the minimum number
1319         * of bits. */
1320        if (min_bits <= tmp_found) {
1321            *bit_off = tmp_off;
1322            *bits_found = tmp_found;
1323            search = 0; /* success */
1324        } else if (tmp_found) {
1325            /*
1326             * Don't show bits which we'll be returning
1327             * for allocation to the local alloc bitmap.
1328             */
1329            ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1330        }
1331    }
1332
1333    return search;
1334}
1335
1336static int ocfs2_block_group_search(struct inode *inode,
1337                    struct buffer_head *group_bh,
1338                    u32 bits_wanted, u32 min_bits,
1339                    u64 max_block,
1340                    u16 *bit_off, u16 *bits_found)
1341{
1342    int ret = -ENOSPC;
1343    u64 blkoff;
1344    struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1345
1346    BUG_ON(min_bits != 1);
1347    BUG_ON(ocfs2_is_cluster_bitmap(inode));
1348
1349    if (bg->bg_free_bits_count) {
1350        ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1351                            group_bh, bits_wanted,
1352                            le16_to_cpu(bg->bg_bits),
1353                            bit_off, bits_found);
1354        if (!ret && max_block) {
1355            blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1356                *bits_found;
1357            mlog(0, "Checking %llu against %llu\n",
1358                 (unsigned long long)blkoff,
1359                 (unsigned long long)max_block);
1360            if (blkoff > max_block)
1361                ret = -ENOSPC;
1362        }
1363    }
1364
1365    return ret;
1366}
1367
1368static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1369                       handle_t *handle,
1370                       struct buffer_head *di_bh,
1371                       u32 num_bits,
1372                       u16 chain)
1373{
1374    int ret;
1375    u32 tmp_used;
1376    struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1377    struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1378
1379    ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1380                      OCFS2_JOURNAL_ACCESS_WRITE);
1381    if (ret < 0) {
1382        mlog_errno(ret);
1383        goto out;
1384    }
1385
1386    tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1387    di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1388    le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1389
1390    ret = ocfs2_journal_dirty(handle, di_bh);
1391    if (ret < 0)
1392        mlog_errno(ret);
1393
1394out:
1395    return ret;
1396}
1397
1398static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1399                  handle_t *handle,
1400                  u32 bits_wanted,
1401                  u32 min_bits,
1402                  u16 *bit_off,
1403                  unsigned int *num_bits,
1404                  u64 gd_blkno,
1405                  u16 *bits_left)
1406{
1407    int ret;
1408    u16 found;
1409    struct buffer_head *group_bh = NULL;
1410    struct ocfs2_group_desc *gd;
1411    struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1412    struct inode *alloc_inode = ac->ac_inode;
1413
1414    ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1415                      &group_bh);
1416    if (ret < 0) {
1417        mlog_errno(ret);
1418        return ret;
1419    }
1420
1421    gd = (struct ocfs2_group_desc *) group_bh->b_data;
1422    ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1423                  ac->ac_max_block, bit_off, &found);
1424    if (ret < 0) {
1425        if (ret != -ENOSPC)
1426            mlog_errno(ret);
1427        goto out;
1428    }
1429
1430    *num_bits = found;
1431
1432    ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1433                           *num_bits,
1434                           le16_to_cpu(gd->bg_chain));
1435    if (ret < 0) {
1436        mlog_errno(ret);
1437        goto out;
1438    }
1439
1440    ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1441                     *bit_off, *num_bits);
1442    if (ret < 0)
1443        mlog_errno(ret);
1444
1445    *bits_left = le16_to_cpu(gd->bg_free_bits_count);
1446
1447out:
1448    brelse(group_bh);
1449
1450    return ret;
1451}
1452
1453static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1454                  handle_t *handle,
1455                  u32 bits_wanted,
1456                  u32 min_bits,
1457                  u16 *bit_off,
1458                  unsigned int *num_bits,
1459                  u64 *bg_blkno,
1460                  u16 *bits_left)
1461{
1462    int status;
1463    u16 chain, tmp_bits;
1464    u32 tmp_used;
1465    u64 next_group;
1466    struct inode *alloc_inode = ac->ac_inode;
1467    struct buffer_head *group_bh = NULL;
1468    struct buffer_head *prev_group_bh = NULL;
1469    struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1470    struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1471    struct ocfs2_group_desc *bg;
1472
1473    chain = ac->ac_chain;
1474    mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1475         bits_wanted, chain,
1476         (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1477
1478    status = ocfs2_read_group_descriptor(alloc_inode, fe,
1479                         le64_to_cpu(cl->cl_recs[chain].c_blkno),
1480                         &group_bh);
1481    if (status < 0) {
1482        mlog_errno(status);
1483        goto bail;
1484    }
1485    bg = (struct ocfs2_group_desc *) group_bh->b_data;
1486
1487    status = -ENOSPC;
1488    /* for now, the chain search is a bit simplistic. We just use
1489     * the 1st group with any empty bits. */
1490    while ((status = ac->ac_group_search(alloc_inode, group_bh,
1491                         bits_wanted, min_bits,
1492                         ac->ac_max_block, bit_off,
1493                         &tmp_bits)) == -ENOSPC) {
1494        if (!bg->bg_next_group)
1495            break;
1496
1497        brelse(prev_group_bh);
1498        prev_group_bh = NULL;
1499
1500        next_group = le64_to_cpu(bg->bg_next_group);
1501        prev_group_bh = group_bh;
1502        group_bh = NULL;
1503        status = ocfs2_read_group_descriptor(alloc_inode, fe,
1504                             next_group, &group_bh);
1505        if (status < 0) {
1506            mlog_errno(status);
1507            goto bail;
1508        }
1509        bg = (struct ocfs2_group_desc *) group_bh->b_data;
1510    }
1511    if (status < 0) {
1512        if (status != -ENOSPC)
1513            mlog_errno(status);
1514        goto bail;
1515    }
1516
1517    mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1518         tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1519
1520    *num_bits = tmp_bits;
1521
1522    BUG_ON(*num_bits == 0);
1523
1524    /*
1525     * Keep track of previous block descriptor read. When
1526     * we find a target, if we have read more than X
1527     * number of descriptors, and the target is reasonably
1528     * empty, relink him to top of his chain.
1529     *
1530     * We've read 0 extra blocks and only send one more to
1531     * the transaction, yet the next guy to search has a
1532     * much easier time.
1533     *
1534     * Do this *after* figuring out how many bits we're taking out
1535     * of our target group.
1536     */
1537    if (ac->ac_allow_chain_relink &&
1538        (prev_group_bh) &&
1539        (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1540        status = ocfs2_relink_block_group(handle, alloc_inode,
1541                          ac->ac_bh, group_bh,
1542                          prev_group_bh, chain);
1543        if (status < 0) {
1544            mlog_errno(status);
1545            goto bail;
1546        }
1547    }
1548
1549    /* Ok, claim our bits now: set the info on dinode, chainlist
1550     * and then the group */
1551    status = ocfs2_journal_access_di(handle,
1552                     INODE_CACHE(alloc_inode),
1553                     ac->ac_bh,
1554                     OCFS2_JOURNAL_ACCESS_WRITE);
1555    if (status < 0) {
1556        mlog_errno(status);
1557        goto bail;
1558    }
1559
1560    tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1561    fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1562    le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1563
1564    status = ocfs2_journal_dirty(handle,
1565                     ac->ac_bh);
1566    if (status < 0) {
1567        mlog_errno(status);
1568        goto bail;
1569    }
1570
1571    status = ocfs2_block_group_set_bits(handle,
1572                        alloc_inode,
1573                        bg,
1574                        group_bh,
1575                        *bit_off,
1576                        *num_bits);
1577    if (status < 0) {
1578        mlog_errno(status);
1579        goto bail;
1580    }
1581
1582    mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1583         (unsigned long long)le64_to_cpu(fe->i_blkno));
1584
1585    *bg_blkno = le64_to_cpu(bg->bg_blkno);
1586    *bits_left = le16_to_cpu(bg->bg_free_bits_count);
1587bail:
1588    brelse(group_bh);
1589    brelse(prev_group_bh);
1590
1591    mlog_exit(status);
1592    return status;
1593}
1594
1595/* will give out up to bits_wanted contiguous bits. */
1596static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1597                     struct ocfs2_alloc_context *ac,
1598                     handle_t *handle,
1599                     u32 bits_wanted,
1600                     u32 min_bits,
1601                     u16 *bit_off,
1602                     unsigned int *num_bits,
1603                     u64 *bg_blkno)
1604{
1605    int status;
1606    u16 victim, i;
1607    u16 bits_left = 0;
1608    u64 hint_blkno = ac->ac_last_group;
1609    struct ocfs2_chain_list *cl;
1610    struct ocfs2_dinode *fe;
1611
1612    mlog_entry_void();
1613
1614    BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1615    BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1616    BUG_ON(!ac->ac_bh);
1617
1618    fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1619
1620    /* The bh was validated by the inode read during
1621     * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
1622    BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1623
1624    if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1625        le32_to_cpu(fe->id1.bitmap1.i_total)) {
1626        ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1627                "bits but only %u total.",
1628                (unsigned long long)le64_to_cpu(fe->i_blkno),
1629                le32_to_cpu(fe->id1.bitmap1.i_used),
1630                le32_to_cpu(fe->id1.bitmap1.i_total));
1631        status = -EIO;
1632        goto bail;
1633    }
1634
1635    if (hint_blkno) {
1636        /* Attempt to short-circuit the usual search mechanism
1637         * by jumping straight to the most recently used
1638         * allocation group. This helps us mantain some
1639         * contiguousness across allocations. */
1640        status = ocfs2_search_one_group(ac, handle, bits_wanted,
1641                        min_bits, bit_off, num_bits,
1642                        hint_blkno, &bits_left);
1643        if (!status) {
1644            /* Be careful to update *bg_blkno here as the
1645             * caller is expecting it to be filled in, and
1646             * ocfs2_search_one_group() won't do that for
1647             * us. */
1648            *bg_blkno = hint_blkno;
1649            goto set_hint;
1650        }
1651        if (status < 0 && status != -ENOSPC) {
1652            mlog_errno(status);
1653            goto bail;
1654        }
1655    }
1656
1657    cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1658
1659    victim = ocfs2_find_victim_chain(cl);
1660    ac->ac_chain = victim;
1661    ac->ac_allow_chain_relink = 1;
1662
1663    status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1664                    num_bits, bg_blkno, &bits_left);
1665    if (!status)
1666        goto set_hint;
1667    if (status < 0 && status != -ENOSPC) {
1668        mlog_errno(status);
1669        goto bail;
1670    }
1671
1672    mlog(0, "Search of victim chain %u came up with nothing, "
1673         "trying all chains now.\n", victim);
1674
1675    /* If we didn't pick a good victim, then just default to
1676     * searching each chain in order. Don't allow chain relinking
1677     * because we only calculate enough journal credits for one
1678     * relink per alloc. */
1679    ac->ac_allow_chain_relink = 0;
1680    for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1681        if (i == victim)
1682            continue;
1683        if (!cl->cl_recs[i].c_free)
1684            continue;
1685
1686        ac->ac_chain = i;
1687        status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1688                        bit_off, num_bits, bg_blkno,
1689                        &bits_left);
1690        if (!status)
1691            break;
1692        if (status < 0 && status != -ENOSPC) {
1693            mlog_errno(status);
1694            goto bail;
1695        }
1696    }
1697
1698set_hint:
1699    if (status != -ENOSPC) {
1700        /* If the next search of this group is not likely to
1701         * yield a suitable extent, then we reset the last
1702         * group hint so as to not waste a disk read */
1703        if (bits_left < min_bits)
1704            ac->ac_last_group = 0;
1705        else
1706            ac->ac_last_group = *bg_blkno;
1707    }
1708
1709bail:
1710    mlog_exit(status);
1711    return status;
1712}
1713
1714int ocfs2_claim_metadata(struct ocfs2_super *osb,
1715             handle_t *handle,
1716             struct ocfs2_alloc_context *ac,
1717             u32 bits_wanted,
1718             u16 *suballoc_bit_start,
1719             unsigned int *num_bits,
1720             u64 *blkno_start)
1721{
1722    int status;
1723    u64 bg_blkno;
1724
1725    BUG_ON(!ac);
1726    BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1727    BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1728
1729    status = ocfs2_claim_suballoc_bits(osb,
1730                       ac,
1731                       handle,
1732                       bits_wanted,
1733                       1,
1734                       suballoc_bit_start,
1735                       num_bits,
1736                       &bg_blkno);
1737    if (status < 0) {
1738        mlog_errno(status);
1739        goto bail;
1740    }
1741    atomic_inc(&osb->alloc_stats.bg_allocs);
1742
1743    *blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1744    ac->ac_bits_given += (*num_bits);
1745    status = 0;
1746bail:
1747    mlog_exit(status);
1748    return status;
1749}
1750
1751static void ocfs2_init_inode_ac_group(struct inode *dir,
1752                      struct buffer_head *parent_fe_bh,
1753                      struct ocfs2_alloc_context *ac)
1754{
1755    struct ocfs2_dinode *fe = (struct ocfs2_dinode *)parent_fe_bh->b_data;
1756    /*
1757     * Try to allocate inodes from some specific group.
1758     *
1759     * If the parent dir has recorded the last group used in allocation,
1760     * cool, use it. Otherwise if we try to allocate new inode from the
1761     * same slot the parent dir belongs to, use the same chunk.
1762     *
1763     * We are very careful here to avoid the mistake of setting
1764     * ac_last_group to a group descriptor from a different (unlocked) slot.
1765     */
1766    if (OCFS2_I(dir)->ip_last_used_group &&
1767        OCFS2_I(dir)->ip_last_used_slot == ac->ac_alloc_slot)
1768        ac->ac_last_group = OCFS2_I(dir)->ip_last_used_group;
1769    else if (le16_to_cpu(fe->i_suballoc_slot) == ac->ac_alloc_slot)
1770        ac->ac_last_group = ocfs2_which_suballoc_group(
1771                    le64_to_cpu(fe->i_blkno),
1772                    le16_to_cpu(fe->i_suballoc_bit));
1773}
1774
1775static inline void ocfs2_save_inode_ac_group(struct inode *dir,
1776                         struct ocfs2_alloc_context *ac)
1777{
1778    OCFS2_I(dir)->ip_last_used_group = ac->ac_last_group;
1779    OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot;
1780}
1781
1782int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1783              handle_t *handle,
1784              struct inode *dir,
1785              struct buffer_head *parent_fe_bh,
1786              struct ocfs2_alloc_context *ac,
1787              u16 *suballoc_bit,
1788              u64 *fe_blkno)
1789{
1790    int status;
1791    unsigned int num_bits;
1792    u64 bg_blkno;
1793
1794    mlog_entry_void();
1795
1796    BUG_ON(!ac);
1797    BUG_ON(ac->ac_bits_given != 0);
1798    BUG_ON(ac->ac_bits_wanted != 1);
1799    BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1800
1801    ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac);
1802
1803    status = ocfs2_claim_suballoc_bits(osb,
1804                       ac,
1805                       handle,
1806                       1,
1807                       1,
1808                       suballoc_bit,
1809                       &num_bits,
1810                       &bg_blkno);
1811    if (status < 0) {
1812        mlog_errno(status);
1813        goto bail;
1814    }
1815    atomic_inc(&osb->alloc_stats.bg_allocs);
1816
1817    BUG_ON(num_bits != 1);
1818
1819    *fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1820    ac->ac_bits_given++;
1821    ocfs2_save_inode_ac_group(dir, ac);
1822    status = 0;
1823bail:
1824    mlog_exit(status);
1825    return status;
1826}
1827
1828/* translate a group desc. blkno and it's bitmap offset into
1829 * disk cluster offset. */
1830static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1831                           u64 bg_blkno,
1832                           u16 bg_bit_off)
1833{
1834    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1835    u32 cluster = 0;
1836
1837    BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1838
1839    if (bg_blkno != osb->first_cluster_group_blkno)
1840        cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1841    cluster += (u32) bg_bit_off;
1842    return cluster;
1843}
1844
1845/* given a cluster offset, calculate which block group it belongs to
1846 * and return that block offset. */
1847u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1848{
1849    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1850    u32 group_no;
1851
1852    BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1853
1854    group_no = cluster / osb->bitmap_cpg;
1855    if (!group_no)
1856        return osb->first_cluster_group_blkno;
1857    return ocfs2_clusters_to_blocks(inode->i_sb,
1858                    group_no * osb->bitmap_cpg);
1859}
1860
1861/* given the block number of a cluster start, calculate which cluster
1862 * group and descriptor bitmap offset that corresponds to. */
1863static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1864                        u64 data_blkno,
1865                        u64 *bg_blkno,
1866                        u16 *bg_bit_off)
1867{
1868    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1869    u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1870
1871    BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1872
1873    *bg_blkno = ocfs2_which_cluster_group(inode,
1874                          data_cluster);
1875
1876    if (*bg_blkno == osb->first_cluster_group_blkno)
1877        *bg_bit_off = (u16) data_cluster;
1878    else
1879        *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1880                                 data_blkno - *bg_blkno);
1881}
1882
1883/*
1884 * min_bits - minimum contiguous chunk from this total allocation we
1885 * can handle. set to what we asked for originally for a full
1886 * contig. allocation, set to '1' to indicate we can deal with extents
1887 * of any size.
1888 */
1889int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1890               handle_t *handle,
1891               struct ocfs2_alloc_context *ac,
1892               u32 min_clusters,
1893               u32 max_clusters,
1894               u32 *cluster_start,
1895               u32 *num_clusters)
1896{
1897    int status;
1898    unsigned int bits_wanted = max_clusters;
1899    u64 bg_blkno = 0;
1900    u16 bg_bit_off;
1901
1902    mlog_entry_void();
1903
1904    BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1905
1906    BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1907           && ac->ac_which != OCFS2_AC_USE_MAIN);
1908
1909    if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1910        status = ocfs2_claim_local_alloc_bits(osb,
1911                              handle,
1912                              ac,
1913                              bits_wanted,
1914                              cluster_start,
1915                              num_clusters);
1916        if (!status)
1917            atomic_inc(&osb->alloc_stats.local_data);
1918    } else {
1919        if (min_clusters > (osb->bitmap_cpg - 1)) {
1920            /* The only paths asking for contiguousness
1921             * should know about this already. */
1922            mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1923                 "group bitmap size %u!\n", min_clusters,
1924                 osb->bitmap_cpg);
1925            status = -ENOSPC;
1926            goto bail;
1927        }
1928        /* clamp the current request down to a realistic size. */
1929        if (bits_wanted > (osb->bitmap_cpg - 1))
1930            bits_wanted = osb->bitmap_cpg - 1;
1931
1932        status = ocfs2_claim_suballoc_bits(osb,
1933                           ac,
1934                           handle,
1935                           bits_wanted,
1936                           min_clusters,
1937                           &bg_bit_off,
1938                           num_clusters,
1939                           &bg_blkno);
1940        if (!status) {
1941            *cluster_start =
1942                ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1943                                 bg_blkno,
1944                                 bg_bit_off);
1945            atomic_inc(&osb->alloc_stats.bitmap_data);
1946        }
1947    }
1948    if (status < 0) {
1949        if (status != -ENOSPC)
1950            mlog_errno(status);
1951        goto bail;
1952    }
1953
1954    ac->ac_bits_given += *num_clusters;
1955
1956bail:
1957    mlog_exit(status);
1958    return status;
1959}
1960
1961int ocfs2_claim_clusters(struct ocfs2_super *osb,
1962             handle_t *handle,
1963             struct ocfs2_alloc_context *ac,
1964             u32 min_clusters,
1965             u32 *cluster_start,
1966             u32 *num_clusters)
1967{
1968    unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1969
1970    return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1971                      bits_wanted, cluster_start, num_clusters);
1972}
1973
1974static int ocfs2_block_group_clear_bits(handle_t *handle,
1975                    struct inode *alloc_inode,
1976                    struct ocfs2_group_desc *bg,
1977                    struct buffer_head *group_bh,
1978                    unsigned int bit_off,
1979                    unsigned int num_bits,
1980                    void (*undo_fn)(unsigned int bit,
1981                            unsigned long *bmap))
1982{
1983    int status;
1984    unsigned int tmp;
1985    struct ocfs2_group_desc *undo_bg = NULL;
1986
1987    mlog_entry_void();
1988
1989    /* The caller got this descriptor from
1990     * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1991    BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1992
1993    mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1994
1995    BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
1996    status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
1997                     group_bh,
1998                     undo_fn ?
1999                     OCFS2_JOURNAL_ACCESS_UNDO :
2000                     OCFS2_JOURNAL_ACCESS_WRITE);
2001    if (status < 0) {
2002        mlog_errno(status);
2003        goto bail;
2004    }
2005
2006    if (undo_fn) {
2007        jbd_lock_bh_state(group_bh);
2008        undo_bg = (struct ocfs2_group_desc *)
2009                    bh2jh(group_bh)->b_committed_data;
2010        BUG_ON(!undo_bg);
2011    }
2012
2013    tmp = num_bits;
2014    while(tmp--) {
2015        ocfs2_clear_bit((bit_off + tmp),
2016                (unsigned long *) bg->bg_bitmap);
2017        if (undo_fn)
2018            undo_fn(bit_off + tmp,
2019                (unsigned long *) undo_bg->bg_bitmap);
2020    }
2021    le16_add_cpu(&bg->bg_free_bits_count, num_bits);
2022
2023    if (undo_fn)
2024        jbd_unlock_bh_state(group_bh);
2025
2026    status = ocfs2_journal_dirty(handle, group_bh);
2027    if (status < 0)
2028        mlog_errno(status);
2029bail:
2030    return status;
2031}
2032
2033/*
2034 * expects the suballoc inode to already be locked.
2035 */
2036static int _ocfs2_free_suballoc_bits(handle_t *handle,
2037                     struct inode *alloc_inode,
2038                     struct buffer_head *alloc_bh,
2039                     unsigned int start_bit,
2040                     u64 bg_blkno,
2041                     unsigned int count,
2042                     void (*undo_fn)(unsigned int bit,
2043                             unsigned long *bitmap))
2044{
2045    int status = 0;
2046    u32 tmp_used;
2047    struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
2048    struct ocfs2_chain_list *cl = &fe->id2.i_chain;
2049    struct buffer_head *group_bh = NULL;
2050    struct ocfs2_group_desc *group;
2051
2052    mlog_entry_void();
2053
2054    /* The alloc_bh comes from ocfs2_free_dinode() or
2055     * ocfs2_free_clusters(). The callers have all locked the
2056     * allocator and gotten alloc_bh from the lock call. This
2057     * validates the dinode buffer. Any corruption that has happended
2058     * is a code bug. */
2059    BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2060    BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2061
2062    mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
2063         (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
2064         (unsigned long long)bg_blkno, start_bit);
2065
2066    status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2067                         &group_bh);
2068    if (status < 0) {
2069        mlog_errno(status);
2070        goto bail;
2071    }
2072    group = (struct ocfs2_group_desc *) group_bh->b_data;
2073
2074    BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
2075
2076    status = ocfs2_block_group_clear_bits(handle, alloc_inode,
2077                          group, group_bh,
2078                          start_bit, count, undo_fn);
2079    if (status < 0) {
2080        mlog_errno(status);
2081        goto bail;
2082    }
2083
2084    status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
2085                     alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2086    if (status < 0) {
2087        mlog_errno(status);
2088        goto bail;
2089    }
2090
2091    le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
2092             count);
2093    tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
2094    fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
2095
2096    status = ocfs2_journal_dirty(handle, alloc_bh);
2097    if (status < 0) {
2098        mlog_errno(status);
2099        goto bail;
2100    }
2101
2102bail:
2103    brelse(group_bh);
2104
2105    mlog_exit(status);
2106    return status;
2107}
2108
2109int ocfs2_free_suballoc_bits(handle_t *handle,
2110                 struct inode *alloc_inode,
2111                 struct buffer_head *alloc_bh,
2112                 unsigned int start_bit,
2113                 u64 bg_blkno,
2114                 unsigned int count)
2115{
2116    return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh,
2117                     start_bit, bg_blkno, count, NULL);
2118}
2119
2120int ocfs2_free_dinode(handle_t *handle,
2121              struct inode *inode_alloc_inode,
2122              struct buffer_head *inode_alloc_bh,
2123              struct ocfs2_dinode *di)
2124{
2125    u64 blk = le64_to_cpu(di->i_blkno);
2126    u16 bit = le16_to_cpu(di->i_suballoc_bit);
2127    u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
2128
2129    return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
2130                    inode_alloc_bh, bit, bg_blkno, 1);
2131}
2132
2133static int _ocfs2_free_clusters(handle_t *handle,
2134                struct inode *bitmap_inode,
2135                struct buffer_head *bitmap_bh,
2136                u64 start_blk,
2137                unsigned int num_clusters,
2138                void (*undo_fn)(unsigned int bit,
2139                        unsigned long *bitmap))
2140{
2141    int status;
2142    u16 bg_start_bit;
2143    u64 bg_blkno;
2144    struct ocfs2_dinode *fe;
2145
2146    /* You can't ever have a contiguous set of clusters
2147     * bigger than a block group bitmap so we never have to worry
2148     * about looping on them. */
2149
2150    mlog_entry_void();
2151
2152    /* This is expensive. We can safely remove once this stuff has
2153     * gotten tested really well. */
2154    BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2155
2156    fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
2157
2158    ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2159                     &bg_start_bit);
2160
2161    mlog(0, "want to free %u clusters starting at block %llu\n",
2162         num_clusters, (unsigned long long)start_blk);
2163    mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
2164         (unsigned long long)bg_blkno, bg_start_bit);
2165
2166    status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2167                       bg_start_bit, bg_blkno,
2168                       num_clusters, undo_fn);
2169    if (status < 0) {
2170        mlog_errno(status);
2171        goto out;
2172    }
2173
2174    ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
2175                     num_clusters);
2176
2177out:
2178    mlog_exit(status);
2179    return status;
2180}
2181
2182int ocfs2_free_clusters(handle_t *handle,
2183            struct inode *bitmap_inode,
2184            struct buffer_head *bitmap_bh,
2185            u64 start_blk,
2186            unsigned int num_clusters)
2187{
2188    return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2189                    start_blk, num_clusters,
2190                    _ocfs2_set_bit);
2191}
2192
2193/*
2194 * Give never-used clusters back to the global bitmap. We don't need
2195 * to protect these bits in the undo buffer.
2196 */
2197int ocfs2_release_clusters(handle_t *handle,
2198               struct inode *bitmap_inode,
2199               struct buffer_head *bitmap_bh,
2200               u64 start_blk,
2201               unsigned int num_clusters)
2202{
2203    return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh,
2204                    start_blk, num_clusters,
2205                    _ocfs2_clear_bit);
2206}
2207
2208static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2209{
2210    printk("Block Group:\n");
2211    printk("bg_signature: %s\n", bg->bg_signature);
2212    printk("bg_size: %u\n", bg->bg_size);
2213    printk("bg_bits: %u\n", bg->bg_bits);
2214    printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2215    printk("bg_chain: %u\n", bg->bg_chain);
2216    printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
2217    printk("bg_next_group: %llu\n",
2218           (unsigned long long)bg->bg_next_group);
2219    printk("bg_parent_dinode: %llu\n",
2220           (unsigned long long)bg->bg_parent_dinode);
2221    printk("bg_blkno: %llu\n",
2222           (unsigned long long)bg->bg_blkno);
2223}
2224
2225static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2226{
2227    int i;
2228
2229    printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2230    printk("i_signature: %s\n", fe->i_signature);
2231    printk("i_size: %llu\n",
2232           (unsigned long long)fe->i_size);
2233    printk("i_clusters: %u\n", fe->i_clusters);
2234    printk("i_generation: %u\n",
2235           le32_to_cpu(fe->i_generation));
2236    printk("id1.bitmap1.i_used: %u\n",
2237           le32_to_cpu(fe->id1.bitmap1.i_used));
2238    printk("id1.bitmap1.i_total: %u\n",
2239           le32_to_cpu(fe->id1.bitmap1.i_total));
2240    printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
2241    printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
2242    printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
2243    printk("id2.i_chain.cl_next_free_rec: %u\n",
2244           fe->id2.i_chain.cl_next_free_rec);
2245    for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2246        printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
2247               fe->id2.i_chain.cl_recs[i].c_free);
2248        printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2249               fe->id2.i_chain.cl_recs[i].c_total);
2250        printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2251               (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2252    }
2253}
2254
2255/*
2256 * For a given allocation, determine which allocators will need to be
2257 * accessed, and lock them, reserving the appropriate number of bits.
2258 *
2259 * Sparse file systems call this from ocfs2_write_begin_nolock()
2260 * and ocfs2_allocate_unwritten_extents().
2261 *
2262 * File systems which don't support holes call this from
2263 * ocfs2_extend_allocation().
2264 */
2265int ocfs2_lock_allocators(struct inode *inode,
2266              struct ocfs2_extent_tree *et,
2267              u32 clusters_to_add, u32 extents_to_split,
2268              struct ocfs2_alloc_context **data_ac,
2269              struct ocfs2_alloc_context **meta_ac)
2270{
2271    int ret = 0, num_free_extents;
2272    unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2273    struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2274
2275    *meta_ac = NULL;
2276    if (data_ac)
2277        *data_ac = NULL;
2278
2279    BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2280
2281    num_free_extents = ocfs2_num_free_extents(osb, et);
2282    if (num_free_extents < 0) {
2283        ret = num_free_extents;
2284        mlog_errno(ret);
2285        goto out;
2286    }
2287
2288    /*
2289     * Sparse allocation file systems need to be more conservative
2290     * with reserving room for expansion - the actual allocation
2291     * happens while we've got a journal handle open so re-taking
2292     * a cluster lock (because we ran out of room for another
2293     * extent) will violate ordering rules.
2294     *
2295     * Most of the time we'll only be seeing this 1 cluster at a time
2296     * anyway.
2297     *
2298     * Always lock for any unwritten extents - we might want to
2299     * add blocks during a split.
2300     */
2301    if (!num_free_extents ||
2302        (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2303        ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2304        if (ret < 0) {
2305            if (ret != -ENOSPC)
2306                mlog_errno(ret);
2307            goto out;
2308        }
2309    }
2310
2311    if (clusters_to_add == 0)
2312        goto out;
2313
2314    ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2315    if (ret < 0) {
2316        if (ret != -ENOSPC)
2317            mlog_errno(ret);
2318        goto out;
2319    }
2320
2321out:
2322    if (ret) {
2323        if (*meta_ac) {
2324            ocfs2_free_alloc_context(*meta_ac);
2325            *meta_ac = NULL;
2326        }
2327
2328        /*
2329         * We cannot have an error and a non null *data_ac.
2330         */
2331    }
2332
2333    return ret;
2334}
2335
2336/*
2337 * Read the inode specified by blkno to get suballoc_slot and
2338 * suballoc_bit.
2339 */
2340static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2341                       u16 *suballoc_slot, u16 *suballoc_bit)
2342{
2343    int status;
2344    struct buffer_head *inode_bh = NULL;
2345    struct ocfs2_dinode *inode_fe;
2346
2347    mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
2348
2349    /* dirty read disk */
2350    status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
2351    if (status < 0) {
2352        mlog(ML_ERROR, "read block %llu failed %d\n",
2353             (unsigned long long)blkno, status);
2354        goto bail;
2355    }
2356
2357    inode_fe = (struct ocfs2_dinode *) inode_bh->b_data;
2358    if (!OCFS2_IS_VALID_DINODE(inode_fe)) {
2359        mlog(ML_ERROR, "invalid inode %llu requested\n",
2360             (unsigned long long)blkno);
2361        status = -EINVAL;
2362        goto bail;
2363    }
2364
2365    if (le16_to_cpu(inode_fe->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
2366        (u32)le16_to_cpu(inode_fe->i_suballoc_slot) > osb->max_slots - 1) {
2367        mlog(ML_ERROR, "inode %llu has invalid suballoc slot %u\n",
2368             (unsigned long long)blkno,
2369             (u32)le16_to_cpu(inode_fe->i_suballoc_slot));
2370        status = -EINVAL;
2371        goto bail;
2372    }
2373
2374    if (suballoc_slot)
2375        *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot);
2376    if (suballoc_bit)
2377        *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit);
2378
2379bail:
2380    brelse(inode_bh);
2381
2382    mlog_exit(status);
2383    return status;
2384}
2385
2386/*
2387 * test whether bit is SET in allocator bitmap or not. on success, 0
2388 * is returned and *res is 1 for SET; 0 otherwise. when fails, errno
2389 * is returned and *res is meaningless. Call this after you have
2390 * cluster locked against suballoc, or you may get a result based on
2391 * non-up2date contents
2392 */
2393static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2394                   struct inode *suballoc,
2395                   struct buffer_head *alloc_bh, u64 blkno,
2396                   u16 bit, int *res)
2397{
2398    struct ocfs2_dinode *alloc_fe;
2399    struct ocfs2_group_desc *group;
2400    struct buffer_head *group_bh = NULL;
2401    u64 bg_blkno;
2402    int status;
2403
2404    mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
2405           (unsigned int)bit);
2406
2407    alloc_fe = (struct ocfs2_dinode *)alloc_bh->b_data;
2408    if ((bit + 1) > ocfs2_bits_per_group(&alloc_fe->id2.i_chain)) {
2409        mlog(ML_ERROR, "suballoc bit %u out of range of %u\n",
2410             (unsigned int)bit,
2411             ocfs2_bits_per_group(&alloc_fe->id2.i_chain));
2412        status = -EINVAL;
2413        goto bail;
2414    }
2415
2416    bg_blkno = ocfs2_which_suballoc_group(blkno, bit);
2417    status = ocfs2_read_group_descriptor(suballoc, alloc_fe, bg_blkno,
2418                         &group_bh);
2419    if (status < 0) {
2420        mlog(ML_ERROR, "read group %llu failed %d\n",
2421             (unsigned long long)bg_blkno, status);
2422        goto bail;
2423    }
2424
2425    group = (struct ocfs2_group_desc *) group_bh->b_data;
2426    *res = ocfs2_test_bit(bit, (unsigned long *)group->bg_bitmap);
2427
2428bail:
2429    brelse(group_bh);
2430
2431    mlog_exit(status);
2432    return status;
2433}
2434
2435/*
2436 * Test if the bit representing this inode (blkno) is set in the
2437 * suballocator.
2438 *
2439 * On success, 0 is returned and *res is 1 for SET; 0 otherwise.
2440 *
2441 * In the event of failure, a negative value is returned and *res is
2442 * meaningless.
2443 *
2444 * Callers must make sure to hold nfs_sync_lock to prevent
2445 * ocfs2_delete_inode() on another node from accessing the same
2446 * suballocator concurrently.
2447 */
2448int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2449{
2450    int status;
2451    u16 suballoc_bit = 0, suballoc_slot = 0;
2452    struct inode *inode_alloc_inode;
2453    struct buffer_head *alloc_bh = NULL;
2454
2455    mlog_entry("blkno: %llu", (unsigned long long)blkno);
2456
2457    status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2458                         &suballoc_bit);
2459    if (status < 0) {
2460        mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status);
2461        goto bail;
2462    }
2463
2464    inode_alloc_inode =
2465        ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
2466                        suballoc_slot);
2467    if (!inode_alloc_inode) {
2468        /* the error code could be inaccurate, but we are not able to
2469         * get the correct one. */
2470        status = -EINVAL;
2471        mlog(ML_ERROR, "unable to get alloc inode in slot %u\n",
2472             (u32)suballoc_slot);
2473        goto bail;
2474    }
2475
2476    mutex_lock(&inode_alloc_inode->i_mutex);
2477    status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2478    if (status < 0) {
2479        mutex_unlock(&inode_alloc_inode->i_mutex);
2480        mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2481             (u32)suballoc_slot, status);
2482        goto bail;
2483    }
2484
2485    status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh,
2486                     blkno, suballoc_bit, res);
2487    if (status < 0)
2488        mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
2489
2490    ocfs2_inode_unlock(inode_alloc_inode, 0);
2491    mutex_unlock(&inode_alloc_inode->i_mutex);
2492
2493    iput(inode_alloc_inode);
2494    brelse(alloc_bh);
2495bail:
2496    mlog_exit(status);
2497    return status;
2498}
2499

Archive Download this file



interactive