Root/fs/ocfs2/alloc.c

1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/swap.h>
31#include <linux/quotaops.h>
32
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#include "alloc.h"
39#include "aops.h"
40#include "blockcheck.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "inode.h"
44#include "journal.h"
45#include "localalloc.h"
46#include "suballoc.h"
47#include "sysfile.h"
48#include "file.h"
49#include "super.h"
50#include "uptodate.h"
51#include "xattr.h"
52#include "refcounttree.h"
53
54#include "buffer_head_io.h"
55
56enum ocfs2_contig_type {
57    CONTIG_NONE = 0,
58    CONTIG_LEFT,
59    CONTIG_RIGHT,
60    CONTIG_LEFTRIGHT,
61};
62
63static enum ocfs2_contig_type
64    ocfs2_extent_rec_contig(struct super_block *sb,
65                struct ocfs2_extent_rec *ext,
66                struct ocfs2_extent_rec *insert_rec);
67/*
68 * Operations for a specific extent tree type.
69 *
70 * To implement an on-disk btree (extent tree) type in ocfs2, add
71 * an ocfs2_extent_tree_operations structure and the matching
72 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
73 * for the allocation portion of the extent tree.
74 */
75struct ocfs2_extent_tree_operations {
76    /*
77     * last_eb_blk is the block number of the right most leaf extent
78     * block. Most on-disk structures containing an extent tree store
79     * this value for fast access. The ->eo_set_last_eb_blk() and
80     * ->eo_get_last_eb_blk() operations access this value. They are
81     * both required.
82     */
83    void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
84                   u64 blkno);
85    u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
86
87    /*
88     * The on-disk structure usually keeps track of how many total
89     * clusters are stored in this extent tree. This function updates
90     * that value. new_clusters is the delta, and must be
91     * added to the total. Required.
92     */
93    void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
94                   u32 new_clusters);
95
96    /*
97     * If this extent tree is supported by an extent map, insert
98     * a record into the map.
99     */
100    void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101                     struct ocfs2_extent_rec *rec);
102
103    /*
104     * If this extent tree is supported by an extent map, truncate the
105     * map to clusters,
106     */
107    void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108                       u32 clusters);
109
110    /*
111     * If ->eo_insert_check() exists, it is called before rec is
112     * inserted into the extent tree. It is optional.
113     */
114    int (*eo_insert_check)(struct ocfs2_extent_tree *et,
115                   struct ocfs2_extent_rec *rec);
116    int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
117
118    /*
119     * --------------------------------------------------------------
120     * The remaining are internal to ocfs2_extent_tree and don't have
121     * accessor functions
122     */
123
124    /*
125     * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
126     * It is required.
127     */
128    void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
129
130    /*
131     * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
132     * it exists. If it does not, et->et_max_leaf_clusters is set
133     * to 0 (unlimited). Optional.
134     */
135    void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
136
137    /*
138     * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
139     * are contiguous or not. Optional. Don't need to set it if use
140     * ocfs2_extent_rec as the tree leaf.
141     */
142    enum ocfs2_contig_type
143        (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144                    struct ocfs2_extent_rec *ext,
145                    struct ocfs2_extent_rec *insert_rec);
146};
147
148
149/*
150 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
151 * in the methods.
152 */
153static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
154static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
155                     u64 blkno);
156static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
157                     u32 clusters);
158static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
159                       struct ocfs2_extent_rec *rec);
160static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161                         u32 clusters);
162static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
163                     struct ocfs2_extent_rec *rec);
164static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
165static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
166static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
167    .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
168    .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
169    .eo_update_clusters = ocfs2_dinode_update_clusters,
170    .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
171    .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
172    .eo_insert_check = ocfs2_dinode_insert_check,
173    .eo_sanity_check = ocfs2_dinode_sanity_check,
174    .eo_fill_root_el = ocfs2_dinode_fill_root_el,
175};
176
177static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
178                     u64 blkno)
179{
180    struct ocfs2_dinode *di = et->et_object;
181
182    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
183    di->i_last_eb_blk = cpu_to_le64(blkno);
184}
185
186static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
187{
188    struct ocfs2_dinode *di = et->et_object;
189
190    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
191    return le64_to_cpu(di->i_last_eb_blk);
192}
193
194static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
195                     u32 clusters)
196{
197    struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
198    struct ocfs2_dinode *di = et->et_object;
199
200    le32_add_cpu(&di->i_clusters, clusters);
201    spin_lock(&oi->ip_lock);
202    oi->ip_clusters = le32_to_cpu(di->i_clusters);
203    spin_unlock(&oi->ip_lock);
204}
205
206static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
207                       struct ocfs2_extent_rec *rec)
208{
209    struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211    ocfs2_extent_map_insert_rec(inode, rec);
212}
213
214static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215                         u32 clusters)
216{
217    struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219    ocfs2_extent_map_trunc(inode, clusters);
220}
221
222static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
223                     struct ocfs2_extent_rec *rec)
224{
225    struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226    struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
227
228    BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
229    mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
230            (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
231            "Device %s, asking for sparse allocation: inode %llu, "
232            "cpos %u, clusters %u\n",
233            osb->dev_str,
234            (unsigned long long)oi->ip_blkno,
235            rec->e_cpos, oi->ip_clusters);
236
237    return 0;
238}
239
240static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
241{
242    struct ocfs2_dinode *di = et->et_object;
243
244    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
245    BUG_ON(!OCFS2_IS_VALID_DINODE(di));
246
247    return 0;
248}
249
250static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
251{
252    struct ocfs2_dinode *di = et->et_object;
253
254    et->et_root_el = &di->id2.i_list;
255}
256
257
258static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
259{
260    struct ocfs2_xattr_value_buf *vb = et->et_object;
261
262    et->et_root_el = &vb->vb_xv->xr_list;
263}
264
265static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
266                          u64 blkno)
267{
268    struct ocfs2_xattr_value_buf *vb = et->et_object;
269
270    vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
271}
272
273static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
274{
275    struct ocfs2_xattr_value_buf *vb = et->et_object;
276
277    return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
278}
279
280static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
281                          u32 clusters)
282{
283    struct ocfs2_xattr_value_buf *vb = et->et_object;
284
285    le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
286}
287
288static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
289    .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
290    .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
291    .eo_update_clusters = ocfs2_xattr_value_update_clusters,
292    .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
293};
294
295static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
296{
297    struct ocfs2_xattr_block *xb = et->et_object;
298
299    et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
300}
301
302static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
303{
304    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
305    et->et_max_leaf_clusters =
306        ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
307}
308
309static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
310                         u64 blkno)
311{
312    struct ocfs2_xattr_block *xb = et->et_object;
313    struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
314
315    xt->xt_last_eb_blk = cpu_to_le64(blkno);
316}
317
318static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
319{
320    struct ocfs2_xattr_block *xb = et->et_object;
321    struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
322
323    return le64_to_cpu(xt->xt_last_eb_blk);
324}
325
326static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
327                         u32 clusters)
328{
329    struct ocfs2_xattr_block *xb = et->et_object;
330
331    le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
332}
333
334static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
335    .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
336    .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
337    .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
338    .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
339    .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
340};
341
342static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
343                      u64 blkno)
344{
345    struct ocfs2_dx_root_block *dx_root = et->et_object;
346
347    dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
348}
349
350static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
351{
352    struct ocfs2_dx_root_block *dx_root = et->et_object;
353
354    return le64_to_cpu(dx_root->dr_last_eb_blk);
355}
356
357static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
358                      u32 clusters)
359{
360    struct ocfs2_dx_root_block *dx_root = et->et_object;
361
362    le32_add_cpu(&dx_root->dr_clusters, clusters);
363}
364
365static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
366{
367    struct ocfs2_dx_root_block *dx_root = et->et_object;
368
369    BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
370
371    return 0;
372}
373
374static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
375{
376    struct ocfs2_dx_root_block *dx_root = et->et_object;
377
378    et->et_root_el = &dx_root->dr_list;
379}
380
381static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
382    .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
383    .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
384    .eo_update_clusters = ocfs2_dx_root_update_clusters,
385    .eo_sanity_check = ocfs2_dx_root_sanity_check,
386    .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
387};
388
389static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390{
391    struct ocfs2_refcount_block *rb = et->et_object;
392
393    et->et_root_el = &rb->rf_list;
394}
395
396static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397                        u64 blkno)
398{
399    struct ocfs2_refcount_block *rb = et->et_object;
400
401    rb->rf_last_eb_blk = cpu_to_le64(blkno);
402}
403
404static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405{
406    struct ocfs2_refcount_block *rb = et->et_object;
407
408    return le64_to_cpu(rb->rf_last_eb_blk);
409}
410
411static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412                        u32 clusters)
413{
414    struct ocfs2_refcount_block *rb = et->et_object;
415
416    le32_add_cpu(&rb->rf_clusters, clusters);
417}
418
419static enum ocfs2_contig_type
420ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421                  struct ocfs2_extent_rec *ext,
422                  struct ocfs2_extent_rec *insert_rec)
423{
424    return CONTIG_NONE;
425}
426
427static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428    .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
429    .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
430    .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
431    .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
432    .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
433};
434
435static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
436                     struct ocfs2_caching_info *ci,
437                     struct buffer_head *bh,
438                     ocfs2_journal_access_func access,
439                     void *obj,
440                     struct ocfs2_extent_tree_operations *ops)
441{
442    et->et_ops = ops;
443    et->et_root_bh = bh;
444    et->et_ci = ci;
445    et->et_root_journal_access = access;
446    if (!obj)
447        obj = (void *)bh->b_data;
448    et->et_object = obj;
449
450    et->et_ops->eo_fill_root_el(et);
451    if (!et->et_ops->eo_fill_max_leaf_clusters)
452        et->et_max_leaf_clusters = 0;
453    else
454        et->et_ops->eo_fill_max_leaf_clusters(et);
455}
456
457void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
458                   struct ocfs2_caching_info *ci,
459                   struct buffer_head *bh)
460{
461    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
462                 NULL, &ocfs2_dinode_et_ops);
463}
464
465void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
466                       struct ocfs2_caching_info *ci,
467                       struct buffer_head *bh)
468{
469    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
470                 NULL, &ocfs2_xattr_tree_et_ops);
471}
472
473void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
474                    struct ocfs2_caching_info *ci,
475                    struct ocfs2_xattr_value_buf *vb)
476{
477    __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
478                 &ocfs2_xattr_value_et_ops);
479}
480
481void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
482                    struct ocfs2_caching_info *ci,
483                    struct buffer_head *bh)
484{
485    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
486                 NULL, &ocfs2_dx_root_et_ops);
487}
488
489void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490                     struct ocfs2_caching_info *ci,
491                     struct buffer_head *bh)
492{
493    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494                 NULL, &ocfs2_refcount_tree_et_ops);
495}
496
497static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
498                        u64 new_last_eb_blk)
499{
500    et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
501}
502
503static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
504{
505    return et->et_ops->eo_get_last_eb_blk(et);
506}
507
508static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
509                        u32 clusters)
510{
511    et->et_ops->eo_update_clusters(et, clusters);
512}
513
514static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515                          struct ocfs2_extent_rec *rec)
516{
517    if (et->et_ops->eo_extent_map_insert)
518        et->et_ops->eo_extent_map_insert(et, rec);
519}
520
521static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522                        u32 clusters)
523{
524    if (et->et_ops->eo_extent_map_truncate)
525        et->et_ops->eo_extent_map_truncate(et, clusters);
526}
527
528static inline int ocfs2_et_root_journal_access(handle_t *handle,
529                           struct ocfs2_extent_tree *et,
530                           int type)
531{
532    return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
533                      type);
534}
535
536static inline enum ocfs2_contig_type
537    ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538                   struct ocfs2_extent_rec *rec,
539                   struct ocfs2_extent_rec *insert_rec)
540{
541    if (et->et_ops->eo_extent_contig)
542        return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544    return ocfs2_extent_rec_contig(
545                ocfs2_metadata_cache_get_super(et->et_ci),
546                rec, insert_rec);
547}
548
549static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
550                    struct ocfs2_extent_rec *rec)
551{
552    int ret = 0;
553
554    if (et->et_ops->eo_insert_check)
555        ret = et->et_ops->eo_insert_check(et, rec);
556    return ret;
557}
558
559static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
560{
561    int ret = 0;
562
563    if (et->et_ops->eo_sanity_check)
564        ret = et->et_ops->eo_sanity_check(et);
565    return ret;
566}
567
568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570                     struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle,
572                       struct ocfs2_extent_tree *et,
573                       struct ocfs2_path *path,
574                       struct ocfs2_extent_rec *insert_rec);
575/*
576 * Reset the actual path elements so that we can re-use the structure
577 * to build another path. Generally, this involves freeing the buffer
578 * heads.
579 */
580void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
581{
582    int i, start = 0, depth = 0;
583    struct ocfs2_path_item *node;
584
585    if (keep_root)
586        start = 1;
587
588    for(i = start; i < path_num_items(path); i++) {
589        node = &path->p_node[i];
590
591        brelse(node->bh);
592        node->bh = NULL;
593        node->el = NULL;
594    }
595
596    /*
597     * Tree depth may change during truncate, or insert. If we're
598     * keeping the root extent list, then make sure that our path
599     * structure reflects the proper depth.
600     */
601    if (keep_root)
602        depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
603    else
604        path_root_access(path) = NULL;
605
606    path->p_tree_depth = depth;
607}
608
609void ocfs2_free_path(struct ocfs2_path *path)
610{
611    if (path) {
612        ocfs2_reinit_path(path, 0);
613        kfree(path);
614    }
615}
616
617/*
618 * All the elements of src into dest. After this call, src could be freed
619 * without affecting dest.
620 *
621 * Both paths should have the same root. Any non-root elements of dest
622 * will be freed.
623 */
624static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
625{
626    int i;
627
628    BUG_ON(path_root_bh(dest) != path_root_bh(src));
629    BUG_ON(path_root_el(dest) != path_root_el(src));
630    BUG_ON(path_root_access(dest) != path_root_access(src));
631
632    ocfs2_reinit_path(dest, 1);
633
634    for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
635        dest->p_node[i].bh = src->p_node[i].bh;
636        dest->p_node[i].el = src->p_node[i].el;
637
638        if (dest->p_node[i].bh)
639            get_bh(dest->p_node[i].bh);
640    }
641}
642
643/*
644 * Make the *dest path the same as src and re-initialize src path to
645 * have a root only.
646 */
647static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
648{
649    int i;
650
651    BUG_ON(path_root_bh(dest) != path_root_bh(src));
652    BUG_ON(path_root_access(dest) != path_root_access(src));
653
654    for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
655        brelse(dest->p_node[i].bh);
656
657        dest->p_node[i].bh = src->p_node[i].bh;
658        dest->p_node[i].el = src->p_node[i].el;
659
660        src->p_node[i].bh = NULL;
661        src->p_node[i].el = NULL;
662    }
663}
664
665/*
666 * Insert an extent block at given index.
667 *
668 * This will not take an additional reference on eb_bh.
669 */
670static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
671                    struct buffer_head *eb_bh)
672{
673    struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
674
675    /*
676     * Right now, no root bh is an extent block, so this helps
677     * catch code errors with dinode trees. The assertion can be
678     * safely removed if we ever need to insert extent block
679     * structures at the root.
680     */
681    BUG_ON(index == 0);
682
683    path->p_node[index].bh = eb_bh;
684    path->p_node[index].el = &eb->h_list;
685}
686
687static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
688                     struct ocfs2_extent_list *root_el,
689                     ocfs2_journal_access_func access)
690{
691    struct ocfs2_path *path;
692
693    BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
694
695    path = kzalloc(sizeof(*path), GFP_NOFS);
696    if (path) {
697        path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
698        get_bh(root_bh);
699        path_root_bh(path) = root_bh;
700        path_root_el(path) = root_el;
701        path_root_access(path) = access;
702    }
703
704    return path;
705}
706
707struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
708{
709    return ocfs2_new_path(path_root_bh(path), path_root_el(path),
710                  path_root_access(path));
711}
712
713struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
714{
715    return ocfs2_new_path(et->et_root_bh, et->et_root_el,
716                  et->et_root_journal_access);
717}
718
719/*
720 * Journal the buffer at depth idx. All idx>0 are extent_blocks,
721 * otherwise it's the root_access function.
722 *
723 * I don't like the way this function's name looks next to
724 * ocfs2_journal_access_path(), but I don't have a better one.
725 */
726int ocfs2_path_bh_journal_access(handle_t *handle,
727                 struct ocfs2_caching_info *ci,
728                 struct ocfs2_path *path,
729                 int idx)
730{
731    ocfs2_journal_access_func access = path_root_access(path);
732
733    if (!access)
734        access = ocfs2_journal_access;
735
736    if (idx)
737        access = ocfs2_journal_access_eb;
738
739    return access(handle, ci, path->p_node[idx].bh,
740              OCFS2_JOURNAL_ACCESS_WRITE);
741}
742
743/*
744 * Convenience function to journal all components in a path.
745 */
746int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
747                  handle_t *handle,
748                  struct ocfs2_path *path)
749{
750    int i, ret = 0;
751
752    if (!path)
753        goto out;
754
755    for(i = 0; i < path_num_items(path); i++) {
756        ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
757        if (ret < 0) {
758            mlog_errno(ret);
759            goto out;
760        }
761    }
762
763out:
764    return ret;
765}
766
767/*
768 * Return the index of the extent record which contains cluster #v_cluster.
769 * -1 is returned if it was not found.
770 *
771 * Should work fine on interior and exterior nodes.
772 */
773int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
774{
775    int ret = -1;
776    int i;
777    struct ocfs2_extent_rec *rec;
778    u32 rec_end, rec_start, clusters;
779
780    for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
781        rec = &el->l_recs[i];
782
783        rec_start = le32_to_cpu(rec->e_cpos);
784        clusters = ocfs2_rec_clusters(el, rec);
785
786        rec_end = rec_start + clusters;
787
788        if (v_cluster >= rec_start && v_cluster < rec_end) {
789            ret = i;
790            break;
791        }
792    }
793
794    return ret;
795}
796
797/*
798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
799 * ocfs2_extent_rec_contig only work properly against leaf nodes!
800 */
801static int ocfs2_block_extent_contig(struct super_block *sb,
802                     struct ocfs2_extent_rec *ext,
803                     u64 blkno)
804{
805    u64 blk_end = le64_to_cpu(ext->e_blkno);
806
807    blk_end += ocfs2_clusters_to_blocks(sb,
808                    le16_to_cpu(ext->e_leaf_clusters));
809
810    return blkno == blk_end;
811}
812
813static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
814                  struct ocfs2_extent_rec *right)
815{
816    u32 left_range;
817
818    left_range = le32_to_cpu(left->e_cpos) +
819        le16_to_cpu(left->e_leaf_clusters);
820
821    return (left_range == le32_to_cpu(right->e_cpos));
822}
823
824static enum ocfs2_contig_type
825    ocfs2_extent_rec_contig(struct super_block *sb,
826                struct ocfs2_extent_rec *ext,
827                struct ocfs2_extent_rec *insert_rec)
828{
829    u64 blkno = le64_to_cpu(insert_rec->e_blkno);
830
831    /*
832     * Refuse to coalesce extent records with different flag
833     * fields - we don't want to mix unwritten extents with user
834     * data.
835     */
836    if (ext->e_flags != insert_rec->e_flags)
837        return CONTIG_NONE;
838
839    if (ocfs2_extents_adjacent(ext, insert_rec) &&
840        ocfs2_block_extent_contig(sb, ext, blkno))
841            return CONTIG_RIGHT;
842
843    blkno = le64_to_cpu(ext->e_blkno);
844    if (ocfs2_extents_adjacent(insert_rec, ext) &&
845        ocfs2_block_extent_contig(sb, insert_rec, blkno))
846        return CONTIG_LEFT;
847
848    return CONTIG_NONE;
849}
850
851/*
852 * NOTE: We can have pretty much any combination of contiguousness and
853 * appending.
854 *
855 * The usefulness of APPEND_TAIL is more in that it lets us know that
856 * we'll have to update the path to that leaf.
857 */
858enum ocfs2_append_type {
859    APPEND_NONE = 0,
860    APPEND_TAIL,
861};
862
863enum ocfs2_split_type {
864    SPLIT_NONE = 0,
865    SPLIT_LEFT,
866    SPLIT_RIGHT,
867};
868
869struct ocfs2_insert_type {
870    enum ocfs2_split_type ins_split;
871    enum ocfs2_append_type ins_appending;
872    enum ocfs2_contig_type ins_contig;
873    int ins_contig_index;
874    int ins_tree_depth;
875};
876
877struct ocfs2_merge_ctxt {
878    enum ocfs2_contig_type c_contig_type;
879    int c_has_empty_extent;
880    int c_split_covers_rec;
881};
882
883static int ocfs2_validate_extent_block(struct super_block *sb,
884                       struct buffer_head *bh)
885{
886    int rc;
887    struct ocfs2_extent_block *eb =
888        (struct ocfs2_extent_block *)bh->b_data;
889
890    mlog(0, "Validating extent block %llu\n",
891         (unsigned long long)bh->b_blocknr);
892
893    BUG_ON(!buffer_uptodate(bh));
894
895    /*
896     * If the ecc fails, we return the error but otherwise
897     * leave the filesystem running. We know any error is
898     * local to this block.
899     */
900    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
901    if (rc) {
902        mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
903             (unsigned long long)bh->b_blocknr);
904        return rc;
905    }
906
907    /*
908     * Errors after here are fatal.
909     */
910
911    if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
912        ocfs2_error(sb,
913                "Extent block #%llu has bad signature %.*s",
914                (unsigned long long)bh->b_blocknr, 7,
915                eb->h_signature);
916        return -EINVAL;
917    }
918
919    if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
920        ocfs2_error(sb,
921                "Extent block #%llu has an invalid h_blkno "
922                "of %llu",
923                (unsigned long long)bh->b_blocknr,
924                (unsigned long long)le64_to_cpu(eb->h_blkno));
925        return -EINVAL;
926    }
927
928    if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
929        ocfs2_error(sb,
930                "Extent block #%llu has an invalid "
931                "h_fs_generation of #%u",
932                (unsigned long long)bh->b_blocknr,
933                le32_to_cpu(eb->h_fs_generation));
934        return -EINVAL;
935    }
936
937    return 0;
938}
939
940int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
941                struct buffer_head **bh)
942{
943    int rc;
944    struct buffer_head *tmp = *bh;
945
946    rc = ocfs2_read_block(ci, eb_blkno, &tmp,
947                  ocfs2_validate_extent_block);
948
949    /* If ocfs2_read_block() got us a new bh, pass it up. */
950    if (!rc && !*bh)
951        *bh = tmp;
952
953    return rc;
954}
955
956
957/*
958 * How many free extents have we got before we need more meta data?
959 */
960int ocfs2_num_free_extents(struct ocfs2_super *osb,
961               struct ocfs2_extent_tree *et)
962{
963    int retval;
964    struct ocfs2_extent_list *el = NULL;
965    struct ocfs2_extent_block *eb;
966    struct buffer_head *eb_bh = NULL;
967    u64 last_eb_blk = 0;
968
969    mlog_entry_void();
970
971    el = et->et_root_el;
972    last_eb_blk = ocfs2_et_get_last_eb_blk(et);
973
974    if (last_eb_blk) {
975        retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
976                         &eb_bh);
977        if (retval < 0) {
978            mlog_errno(retval);
979            goto bail;
980        }
981        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
982        el = &eb->h_list;
983    }
984
985    BUG_ON(el->l_tree_depth != 0);
986
987    retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
988bail:
989    brelse(eb_bh);
990
991    mlog_exit(retval);
992    return retval;
993}
994
995/* expects array to already be allocated
996 *
997 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
998 * l_count for you
999 */
1000static int ocfs2_create_new_meta_bhs(handle_t *handle,
1001                     struct ocfs2_extent_tree *et,
1002                     int wanted,
1003                     struct ocfs2_alloc_context *meta_ac,
1004                     struct buffer_head *bhs[])
1005{
1006    int count, status, i;
1007    u16 suballoc_bit_start;
1008    u32 num_got;
1009    u64 first_blkno;
1010    struct ocfs2_super *osb =
1011        OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012    struct ocfs2_extent_block *eb;
1013
1014    mlog_entry_void();
1015
1016    count = 0;
1017    while (count < wanted) {
1018        status = ocfs2_claim_metadata(osb,
1019                          handle,
1020                          meta_ac,
1021                          wanted - count,
1022                          &suballoc_bit_start,
1023                          &num_got,
1024                          &first_blkno);
1025        if (status < 0) {
1026            mlog_errno(status);
1027            goto bail;
1028        }
1029
1030        for(i = count; i < (num_got + count); i++) {
1031            bhs[i] = sb_getblk(osb->sb, first_blkno);
1032            if (bhs[i] == NULL) {
1033                status = -EIO;
1034                mlog_errno(status);
1035                goto bail;
1036            }
1037            ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1038
1039            status = ocfs2_journal_access_eb(handle, et->et_ci,
1040                             bhs[i],
1041                             OCFS2_JOURNAL_ACCESS_CREATE);
1042            if (status < 0) {
1043                mlog_errno(status);
1044                goto bail;
1045            }
1046
1047            memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1048            eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1049            /* Ok, setup the minimal stuff here. */
1050            strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1051            eb->h_blkno = cpu_to_le64(first_blkno);
1052            eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1053            eb->h_suballoc_slot =
1054                cpu_to_le16(meta_ac->ac_alloc_slot);
1055            eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1056            eb->h_list.l_count =
1057                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1058
1059            suballoc_bit_start++;
1060            first_blkno++;
1061
1062            /* We'll also be dirtied by the caller, so
1063             * this isn't absolutely necessary. */
1064            status = ocfs2_journal_dirty(handle, bhs[i]);
1065            if (status < 0) {
1066                mlog_errno(status);
1067                goto bail;
1068            }
1069        }
1070
1071        count += num_got;
1072    }
1073
1074    status = 0;
1075bail:
1076    if (status < 0) {
1077        for(i = 0; i < wanted; i++) {
1078            brelse(bhs[i]);
1079            bhs[i] = NULL;
1080        }
1081    }
1082    mlog_exit(status);
1083    return status;
1084}
1085
1086/*
1087 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1088 *
1089 * Returns the sum of the rightmost extent rec logical offset and
1090 * cluster count.
1091 *
1092 * ocfs2_add_branch() uses this to determine what logical cluster
1093 * value should be populated into the leftmost new branch records.
1094 *
1095 * ocfs2_shift_tree_depth() uses this to determine the # clusters
1096 * value for the new topmost tree record.
1097 */
1098static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
1099{
1100    int i;
1101
1102    i = le16_to_cpu(el->l_next_free_rec) - 1;
1103
1104    return le32_to_cpu(el->l_recs[i].e_cpos) +
1105        ocfs2_rec_clusters(el, &el->l_recs[i]);
1106}
1107
1108/*
1109 * Change range of the branches in the right most path according to the leaf
1110 * extent block's rightmost record.
1111 */
1112static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1113                     struct ocfs2_extent_tree *et)
1114{
1115    int status;
1116    struct ocfs2_path *path = NULL;
1117    struct ocfs2_extent_list *el;
1118    struct ocfs2_extent_rec *rec;
1119
1120    path = ocfs2_new_path_from_et(et);
1121    if (!path) {
1122        status = -ENOMEM;
1123        return status;
1124    }
1125
1126    status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1127    if (status < 0) {
1128        mlog_errno(status);
1129        goto out;
1130    }
1131
1132    status = ocfs2_extend_trans(handle, path_num_items(path) +
1133                    handle->h_buffer_credits);
1134    if (status < 0) {
1135        mlog_errno(status);
1136        goto out;
1137    }
1138
1139    status = ocfs2_journal_access_path(et->et_ci, handle, path);
1140    if (status < 0) {
1141        mlog_errno(status);
1142        goto out;
1143    }
1144
1145    el = path_leaf_el(path);
1146    rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1147
1148    ocfs2_adjust_rightmost_records(handle, et, path, rec);
1149
1150out:
1151    ocfs2_free_path(path);
1152    return status;
1153}
1154
1155/*
1156 * Add an entire tree branch to our inode. eb_bh is the extent block
1157 * to start at, if we don't want to start the branch at the root
1158 * structure.
1159 *
1160 * last_eb_bh is required as we have to update it's next_leaf pointer
1161 * for the new last extent block.
1162 *
1163 * the new branch will be 'empty' in the sense that every block will
1164 * contain a single record with cluster count == 0.
1165 */
1166static int ocfs2_add_branch(handle_t *handle,
1167                struct ocfs2_extent_tree *et,
1168                struct buffer_head *eb_bh,
1169                struct buffer_head **last_eb_bh,
1170                struct ocfs2_alloc_context *meta_ac)
1171{
1172    int status, new_blocks, i;
1173    u64 next_blkno, new_last_eb_blk;
1174    struct buffer_head *bh;
1175    struct buffer_head **new_eb_bhs = NULL;
1176    struct ocfs2_extent_block *eb;
1177    struct ocfs2_extent_list *eb_el;
1178    struct ocfs2_extent_list *el;
1179    u32 new_cpos, root_end;
1180
1181    mlog_entry_void();
1182
1183    BUG_ON(!last_eb_bh || !*last_eb_bh);
1184
1185    if (eb_bh) {
1186        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1187        el = &eb->h_list;
1188    } else
1189        el = et->et_root_el;
1190
1191    /* we never add a branch to a leaf. */
1192    BUG_ON(!el->l_tree_depth);
1193
1194    new_blocks = le16_to_cpu(el->l_tree_depth);
1195
1196    eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1197    new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1198    root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1199
1200    /*
1201     * If there is a gap before the root end and the real end
1202     * of the righmost leaf block, we need to remove the gap
1203     * between new_cpos and root_end first so that the tree
1204     * is consistent after we add a new branch(it will start
1205     * from new_cpos).
1206     */
1207    if (root_end > new_cpos) {
1208        mlog(0, "adjust the cluster end from %u to %u\n",
1209             root_end, new_cpos);
1210        status = ocfs2_adjust_rightmost_branch(handle, et);
1211        if (status) {
1212            mlog_errno(status);
1213            goto bail;
1214        }
1215    }
1216
1217    /* allocate the number of new eb blocks we need */
1218    new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1219                 GFP_KERNEL);
1220    if (!new_eb_bhs) {
1221        status = -ENOMEM;
1222        mlog_errno(status);
1223        goto bail;
1224    }
1225
1226    status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1227                       meta_ac, new_eb_bhs);
1228    if (status < 0) {
1229        mlog_errno(status);
1230        goto bail;
1231    }
1232
1233    /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1234     * linked with the rest of the tree.
1235     * conversly, new_eb_bhs[0] is the new bottommost leaf.
1236     *
1237     * when we leave the loop, new_last_eb_blk will point to the
1238     * newest leaf, and next_blkno will point to the topmost extent
1239     * block. */
1240    next_blkno = new_last_eb_blk = 0;
1241    for(i = 0; i < new_blocks; i++) {
1242        bh = new_eb_bhs[i];
1243        eb = (struct ocfs2_extent_block *) bh->b_data;
1244        /* ocfs2_create_new_meta_bhs() should create it right! */
1245        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1246        eb_el = &eb->h_list;
1247
1248        status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1249                         OCFS2_JOURNAL_ACCESS_CREATE);
1250        if (status < 0) {
1251            mlog_errno(status);
1252            goto bail;
1253        }
1254
1255        eb->h_next_leaf_blk = 0;
1256        eb_el->l_tree_depth = cpu_to_le16(i);
1257        eb_el->l_next_free_rec = cpu_to_le16(1);
1258        /*
1259         * This actually counts as an empty extent as
1260         * c_clusters == 0
1261         */
1262        eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1263        eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1264        /*
1265         * eb_el isn't always an interior node, but even leaf
1266         * nodes want a zero'd flags and reserved field so
1267         * this gets the whole 32 bits regardless of use.
1268         */
1269        eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1270        if (!eb_el->l_tree_depth)
1271            new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1272
1273        status = ocfs2_journal_dirty(handle, bh);
1274        if (status < 0) {
1275            mlog_errno(status);
1276            goto bail;
1277        }
1278
1279        next_blkno = le64_to_cpu(eb->h_blkno);
1280    }
1281
1282    /* This is a bit hairy. We want to update up to three blocks
1283     * here without leaving any of them in an inconsistent state
1284     * in case of error. We don't have to worry about
1285     * journal_dirty erroring as it won't unless we've aborted the
1286     * handle (in which case we would never be here) so reserving
1287     * the write with journal_access is all we need to do. */
1288    status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1289                     OCFS2_JOURNAL_ACCESS_WRITE);
1290    if (status < 0) {
1291        mlog_errno(status);
1292        goto bail;
1293    }
1294    status = ocfs2_et_root_journal_access(handle, et,
1295                          OCFS2_JOURNAL_ACCESS_WRITE);
1296    if (status < 0) {
1297        mlog_errno(status);
1298        goto bail;
1299    }
1300    if (eb_bh) {
1301        status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1302                         OCFS2_JOURNAL_ACCESS_WRITE);
1303        if (status < 0) {
1304            mlog_errno(status);
1305            goto bail;
1306        }
1307    }
1308
1309    /* Link the new branch into the rest of the tree (el will
1310     * either be on the root_bh, or the extent block passed in. */
1311    i = le16_to_cpu(el->l_next_free_rec);
1312    el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1313    el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1314    el->l_recs[i].e_int_clusters = 0;
1315    le16_add_cpu(&el->l_next_free_rec, 1);
1316
1317    /* fe needs a new last extent block pointer, as does the
1318     * next_leaf on the previously last-extent-block. */
1319    ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1320
1321    eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1322    eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1323
1324    status = ocfs2_journal_dirty(handle, *last_eb_bh);
1325    if (status < 0)
1326        mlog_errno(status);
1327    status = ocfs2_journal_dirty(handle, et->et_root_bh);
1328    if (status < 0)
1329        mlog_errno(status);
1330    if (eb_bh) {
1331        status = ocfs2_journal_dirty(handle, eb_bh);
1332        if (status < 0)
1333            mlog_errno(status);
1334    }
1335
1336    /*
1337     * Some callers want to track the rightmost leaf so pass it
1338     * back here.
1339     */
1340    brelse(*last_eb_bh);
1341    get_bh(new_eb_bhs[0]);
1342    *last_eb_bh = new_eb_bhs[0];
1343
1344    status = 0;
1345bail:
1346    if (new_eb_bhs) {
1347        for (i = 0; i < new_blocks; i++)
1348            brelse(new_eb_bhs[i]);
1349        kfree(new_eb_bhs);
1350    }
1351
1352    mlog_exit(status);
1353    return status;
1354}
1355
1356/*
1357 * adds another level to the allocation tree.
1358 * returns back the new extent block so you can add a branch to it
1359 * after this call.
1360 */
1361static int ocfs2_shift_tree_depth(handle_t *handle,
1362                  struct ocfs2_extent_tree *et,
1363                  struct ocfs2_alloc_context *meta_ac,
1364                  struct buffer_head **ret_new_eb_bh)
1365{
1366    int status, i;
1367    u32 new_clusters;
1368    struct buffer_head *new_eb_bh = NULL;
1369    struct ocfs2_extent_block *eb;
1370    struct ocfs2_extent_list *root_el;
1371    struct ocfs2_extent_list *eb_el;
1372
1373    mlog_entry_void();
1374
1375    status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1376                       &new_eb_bh);
1377    if (status < 0) {
1378        mlog_errno(status);
1379        goto bail;
1380    }
1381
1382    eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1383    /* ocfs2_create_new_meta_bhs() should create it right! */
1384    BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1385
1386    eb_el = &eb->h_list;
1387    root_el = et->et_root_el;
1388
1389    status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1390                     OCFS2_JOURNAL_ACCESS_CREATE);
1391    if (status < 0) {
1392        mlog_errno(status);
1393        goto bail;
1394    }
1395
1396    /* copy the root extent list data into the new extent block */
1397    eb_el->l_tree_depth = root_el->l_tree_depth;
1398    eb_el->l_next_free_rec = root_el->l_next_free_rec;
1399    for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1400        eb_el->l_recs[i] = root_el->l_recs[i];
1401
1402    status = ocfs2_journal_dirty(handle, new_eb_bh);
1403    if (status < 0) {
1404        mlog_errno(status);
1405        goto bail;
1406    }
1407
1408    status = ocfs2_et_root_journal_access(handle, et,
1409                          OCFS2_JOURNAL_ACCESS_WRITE);
1410    if (status < 0) {
1411        mlog_errno(status);
1412        goto bail;
1413    }
1414
1415    new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1416
1417    /* update root_bh now */
1418    le16_add_cpu(&root_el->l_tree_depth, 1);
1419    root_el->l_recs[0].e_cpos = 0;
1420    root_el->l_recs[0].e_blkno = eb->h_blkno;
1421    root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1422    for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1423        memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1424    root_el->l_next_free_rec = cpu_to_le16(1);
1425
1426    /* If this is our 1st tree depth shift, then last_eb_blk
1427     * becomes the allocated extent block */
1428    if (root_el->l_tree_depth == cpu_to_le16(1))
1429        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1430
1431    status = ocfs2_journal_dirty(handle, et->et_root_bh);
1432    if (status < 0) {
1433        mlog_errno(status);
1434        goto bail;
1435    }
1436
1437    *ret_new_eb_bh = new_eb_bh;
1438    new_eb_bh = NULL;
1439    status = 0;
1440bail:
1441    brelse(new_eb_bh);
1442
1443    mlog_exit(status);
1444    return status;
1445}
1446
1447/*
1448 * Should only be called when there is no space left in any of the
1449 * leaf nodes. What we want to do is find the lowest tree depth
1450 * non-leaf extent block with room for new records. There are three
1451 * valid results of this search:
1452 *
1453 * 1) a lowest extent block is found, then we pass it back in
1454 * *lowest_eb_bh and return '0'
1455 *
1456 * 2) the search fails to find anything, but the root_el has room. We
1457 * pass NULL back in *lowest_eb_bh, but still return '0'
1458 *
1459 * 3) the search fails to find anything AND the root_el is full, in
1460 * which case we return > 0
1461 *
1462 * return status < 0 indicates an error.
1463 */
1464static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1465                    struct buffer_head **target_bh)
1466{
1467    int status = 0, i;
1468    u64 blkno;
1469    struct ocfs2_extent_block *eb;
1470    struct ocfs2_extent_list *el;
1471    struct buffer_head *bh = NULL;
1472    struct buffer_head *lowest_bh = NULL;
1473
1474    mlog_entry_void();
1475
1476    *target_bh = NULL;
1477
1478    el = et->et_root_el;
1479
1480    while(le16_to_cpu(el->l_tree_depth) > 1) {
1481        if (le16_to_cpu(el->l_next_free_rec) == 0) {
1482            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1483                    "Owner %llu has empty "
1484                    "extent list (next_free_rec == 0)",
1485                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1486            status = -EIO;
1487            goto bail;
1488        }
1489        i = le16_to_cpu(el->l_next_free_rec) - 1;
1490        blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1491        if (!blkno) {
1492            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1493                    "Owner %llu has extent "
1494                    "list where extent # %d has no physical "
1495                    "block start",
1496                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1497            status = -EIO;
1498            goto bail;
1499        }
1500
1501        brelse(bh);
1502        bh = NULL;
1503
1504        status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1505        if (status < 0) {
1506            mlog_errno(status);
1507            goto bail;
1508        }
1509
1510        eb = (struct ocfs2_extent_block *) bh->b_data;
1511        el = &eb->h_list;
1512
1513        if (le16_to_cpu(el->l_next_free_rec) <
1514            le16_to_cpu(el->l_count)) {
1515            brelse(lowest_bh);
1516            lowest_bh = bh;
1517            get_bh(lowest_bh);
1518        }
1519    }
1520
1521    /* If we didn't find one and the fe doesn't have any room,
1522     * then return '1' */
1523    el = et->et_root_el;
1524    if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1525        status = 1;
1526
1527    *target_bh = lowest_bh;
1528bail:
1529    brelse(bh);
1530
1531    mlog_exit(status);
1532    return status;
1533}
1534
1535/*
1536 * Grow a b-tree so that it has more records.
1537 *
1538 * We might shift the tree depth in which case existing paths should
1539 * be considered invalid.
1540 *
1541 * Tree depth after the grow is returned via *final_depth.
1542 *
1543 * *last_eb_bh will be updated by ocfs2_add_branch().
1544 */
1545static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1546               int *final_depth, struct buffer_head **last_eb_bh,
1547               struct ocfs2_alloc_context *meta_ac)
1548{
1549    int ret, shift;
1550    struct ocfs2_extent_list *el = et->et_root_el;
1551    int depth = le16_to_cpu(el->l_tree_depth);
1552    struct buffer_head *bh = NULL;
1553
1554    BUG_ON(meta_ac == NULL);
1555
1556    shift = ocfs2_find_branch_target(et, &bh);
1557    if (shift < 0) {
1558        ret = shift;
1559        mlog_errno(ret);
1560        goto out;
1561    }
1562
1563    /* We traveled all the way to the bottom of the allocation tree
1564     * and didn't find room for any more extents - we need to add
1565     * another tree level */
1566    if (shift) {
1567        BUG_ON(bh);
1568        mlog(0, "need to shift tree depth (current = %d)\n", depth);
1569
1570        /* ocfs2_shift_tree_depth will return us a buffer with
1571         * the new extent block (so we can pass that to
1572         * ocfs2_add_branch). */
1573        ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1574        if (ret < 0) {
1575            mlog_errno(ret);
1576            goto out;
1577        }
1578        depth++;
1579        if (depth == 1) {
1580            /*
1581             * Special case: we have room now if we shifted from
1582             * tree_depth 0, so no more work needs to be done.
1583             *
1584             * We won't be calling add_branch, so pass
1585             * back *last_eb_bh as the new leaf. At depth
1586             * zero, it should always be null so there's
1587             * no reason to brelse.
1588             */
1589            BUG_ON(*last_eb_bh);
1590            get_bh(bh);
1591            *last_eb_bh = bh;
1592            goto out;
1593        }
1594    }
1595
1596    /* call ocfs2_add_branch to add the final part of the tree with
1597     * the new data. */
1598    mlog(0, "add branch. bh = %p\n", bh);
1599    ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1600                   meta_ac);
1601    if (ret < 0) {
1602        mlog_errno(ret);
1603        goto out;
1604    }
1605
1606out:
1607    if (final_depth)
1608        *final_depth = depth;
1609    brelse(bh);
1610    return ret;
1611}
1612
1613/*
1614 * This function will discard the rightmost extent record.
1615 */
1616static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1617{
1618    int next_free = le16_to_cpu(el->l_next_free_rec);
1619    int count = le16_to_cpu(el->l_count);
1620    unsigned int num_bytes;
1621
1622    BUG_ON(!next_free);
1623    /* This will cause us to go off the end of our extent list. */
1624    BUG_ON(next_free >= count);
1625
1626    num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1627
1628    memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1629}
1630
1631static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1632                  struct ocfs2_extent_rec *insert_rec)
1633{
1634    int i, insert_index, next_free, has_empty, num_bytes;
1635    u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1636    struct ocfs2_extent_rec *rec;
1637
1638    next_free = le16_to_cpu(el->l_next_free_rec);
1639    has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1640
1641    BUG_ON(!next_free);
1642
1643    /* The tree code before us didn't allow enough room in the leaf. */
1644    BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1645
1646    /*
1647     * The easiest way to approach this is to just remove the
1648     * empty extent and temporarily decrement next_free.
1649     */
1650    if (has_empty) {
1651        /*
1652         * If next_free was 1 (only an empty extent), this
1653         * loop won't execute, which is fine. We still want
1654         * the decrement above to happen.
1655         */
1656        for(i = 0; i < (next_free - 1); i++)
1657            el->l_recs[i] = el->l_recs[i+1];
1658
1659        next_free--;
1660    }
1661
1662    /*
1663     * Figure out what the new record index should be.
1664     */
1665    for(i = 0; i < next_free; i++) {
1666        rec = &el->l_recs[i];
1667
1668        if (insert_cpos < le32_to_cpu(rec->e_cpos))
1669            break;
1670    }
1671    insert_index = i;
1672
1673    mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1674         insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1675
1676    BUG_ON(insert_index < 0);
1677    BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1678    BUG_ON(insert_index > next_free);
1679
1680    /*
1681     * No need to memmove if we're just adding to the tail.
1682     */
1683    if (insert_index != next_free) {
1684        BUG_ON(next_free >= le16_to_cpu(el->l_count));
1685
1686        num_bytes = next_free - insert_index;
1687        num_bytes *= sizeof(struct ocfs2_extent_rec);
1688        memmove(&el->l_recs[insert_index + 1],
1689            &el->l_recs[insert_index],
1690            num_bytes);
1691    }
1692
1693    /*
1694     * Either we had an empty extent, and need to re-increment or
1695     * there was no empty extent on a non full rightmost leaf node,
1696     * in which case we still need to increment.
1697     */
1698    next_free++;
1699    el->l_next_free_rec = cpu_to_le16(next_free);
1700    /*
1701     * Make sure none of the math above just messed up our tree.
1702     */
1703    BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1704
1705    el->l_recs[insert_index] = *insert_rec;
1706
1707}
1708
1709static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1710{
1711    int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1712
1713    BUG_ON(num_recs == 0);
1714
1715    if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1716        num_recs--;
1717        size = num_recs * sizeof(struct ocfs2_extent_rec);
1718        memmove(&el->l_recs[0], &el->l_recs[1], size);
1719        memset(&el->l_recs[num_recs], 0,
1720               sizeof(struct ocfs2_extent_rec));
1721        el->l_next_free_rec = cpu_to_le16(num_recs);
1722    }
1723}
1724
1725/*
1726 * Create an empty extent record .
1727 *
1728 * l_next_free_rec may be updated.
1729 *
1730 * If an empty extent already exists do nothing.
1731 */
1732static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1733{
1734    int next_free = le16_to_cpu(el->l_next_free_rec);
1735
1736    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1737
1738    if (next_free == 0)
1739        goto set_and_inc;
1740
1741    if (ocfs2_is_empty_extent(&el->l_recs[0]))
1742        return;
1743
1744    mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1745            "Asked to create an empty extent in a full list:\n"
1746            "count = %u, tree depth = %u",
1747            le16_to_cpu(el->l_count),
1748            le16_to_cpu(el->l_tree_depth));
1749
1750    ocfs2_shift_records_right(el);
1751
1752set_and_inc:
1753    le16_add_cpu(&el->l_next_free_rec, 1);
1754    memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1755}
1756
1757/*
1758 * For a rotation which involves two leaf nodes, the "root node" is
1759 * the lowest level tree node which contains a path to both leafs. This
1760 * resulting set of information can be used to form a complete "subtree"
1761 *
1762 * This function is passed two full paths from the dinode down to a
1763 * pair of adjacent leaves. It's task is to figure out which path
1764 * index contains the subtree root - this can be the root index itself
1765 * in a worst-case rotation.
1766 *
1767 * The array index of the subtree root is passed back.
1768 */
1769int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1770                struct ocfs2_path *left,
1771                struct ocfs2_path *right)
1772{
1773    int i = 0;
1774
1775    /*
1776     * Check that the caller passed in two paths from the same tree.
1777     */
1778    BUG_ON(path_root_bh(left) != path_root_bh(right));
1779
1780    do {
1781        i++;
1782
1783        /*
1784         * The caller didn't pass two adjacent paths.
1785         */
1786        mlog_bug_on_msg(i > left->p_tree_depth,
1787                "Owner %llu, left depth %u, right depth %u\n"
1788                "left leaf blk %llu, right leaf blk %llu\n",
1789                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1790                left->p_tree_depth, right->p_tree_depth,
1791                (unsigned long long)path_leaf_bh(left)->b_blocknr,
1792                (unsigned long long)path_leaf_bh(right)->b_blocknr);
1793    } while (left->p_node[i].bh->b_blocknr ==
1794         right->p_node[i].bh->b_blocknr);
1795
1796    return i - 1;
1797}
1798
1799typedef void (path_insert_t)(void *, struct buffer_head *);
1800
1801/*
1802 * Traverse a btree path in search of cpos, starting at root_el.
1803 *
1804 * This code can be called with a cpos larger than the tree, in which
1805 * case it will return the rightmost path.
1806 */
1807static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1808                 struct ocfs2_extent_list *root_el, u32 cpos,
1809                 path_insert_t *func, void *data)
1810{
1811    int i, ret = 0;
1812    u32 range;
1813    u64 blkno;
1814    struct buffer_head *bh = NULL;
1815    struct ocfs2_extent_block *eb;
1816    struct ocfs2_extent_list *el;
1817    struct ocfs2_extent_rec *rec;
1818
1819    el = root_el;
1820    while (el->l_tree_depth) {
1821        if (le16_to_cpu(el->l_next_free_rec) == 0) {
1822            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1823                    "Owner %llu has empty extent list at "
1824                    "depth %u\n",
1825                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1826                    le16_to_cpu(el->l_tree_depth));
1827            ret = -EROFS;
1828            goto out;
1829
1830        }
1831
1832        for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1833            rec = &el->l_recs[i];
1834
1835            /*
1836             * In the case that cpos is off the allocation
1837             * tree, this should just wind up returning the
1838             * rightmost record.
1839             */
1840            range = le32_to_cpu(rec->e_cpos) +
1841                ocfs2_rec_clusters(el, rec);
1842            if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1843                break;
1844        }
1845
1846        blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1847        if (blkno == 0) {
1848            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1849                    "Owner %llu has bad blkno in extent list "
1850                    "at depth %u (index %d)\n",
1851                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1852                    le16_to_cpu(el->l_tree_depth), i);
1853            ret = -EROFS;
1854            goto out;
1855        }
1856
1857        brelse(bh);
1858        bh = NULL;
1859        ret = ocfs2_read_extent_block(ci, blkno, &bh);
1860        if (ret) {
1861            mlog_errno(ret);
1862            goto out;
1863        }
1864
1865        eb = (struct ocfs2_extent_block *) bh->b_data;
1866        el = &eb->h_list;
1867
1868        if (le16_to_cpu(el->l_next_free_rec) >
1869            le16_to_cpu(el->l_count)) {
1870            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1871                    "Owner %llu has bad count in extent list "
1872                    "at block %llu (next free=%u, count=%u)\n",
1873                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1874                    (unsigned long long)bh->b_blocknr,
1875                    le16_to_cpu(el->l_next_free_rec),
1876                    le16_to_cpu(el->l_count));
1877            ret = -EROFS;
1878            goto out;
1879        }
1880
1881        if (func)
1882            func(data, bh);
1883    }
1884
1885out:
1886    /*
1887     * Catch any trailing bh that the loop didn't handle.
1888     */
1889    brelse(bh);
1890
1891    return ret;
1892}
1893
1894/*
1895 * Given an initialized path (that is, it has a valid root extent
1896 * list), this function will traverse the btree in search of the path
1897 * which would contain cpos.
1898 *
1899 * The path traveled is recorded in the path structure.
1900 *
1901 * Note that this will not do any comparisons on leaf node extent
1902 * records, so it will work fine in the case that we just added a tree
1903 * branch.
1904 */
1905struct find_path_data {
1906    int index;
1907    struct ocfs2_path *path;
1908};
1909static void find_path_ins(void *data, struct buffer_head *bh)
1910{
1911    struct find_path_data *fp = data;
1912
1913    get_bh(bh);
1914    ocfs2_path_insert_eb(fp->path, fp->index, bh);
1915    fp->index++;
1916}
1917int ocfs2_find_path(struct ocfs2_caching_info *ci,
1918            struct ocfs2_path *path, u32 cpos)
1919{
1920    struct find_path_data data;
1921
1922    data.index = 1;
1923    data.path = path;
1924    return __ocfs2_find_path(ci, path_root_el(path), cpos,
1925                 find_path_ins, &data);
1926}
1927
1928static void find_leaf_ins(void *data, struct buffer_head *bh)
1929{
1930    struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1931    struct ocfs2_extent_list *el = &eb->h_list;
1932    struct buffer_head **ret = data;
1933
1934    /* We want to retain only the leaf block. */
1935    if (le16_to_cpu(el->l_tree_depth) == 0) {
1936        get_bh(bh);
1937        *ret = bh;
1938    }
1939}
1940/*
1941 * Find the leaf block in the tree which would contain cpos. No
1942 * checking of the actual leaf is done.
1943 *
1944 * Some paths want to call this instead of allocating a path structure
1945 * and calling ocfs2_find_path().
1946 *
1947 * This function doesn't handle non btree extent lists.
1948 */
1949int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1950            struct ocfs2_extent_list *root_el, u32 cpos,
1951            struct buffer_head **leaf_bh)
1952{
1953    int ret;
1954    struct buffer_head *bh = NULL;
1955
1956    ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1957    if (ret) {
1958        mlog_errno(ret);
1959        goto out;
1960    }
1961
1962    *leaf_bh = bh;
1963out:
1964    return ret;
1965}
1966
1967/*
1968 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1969 *
1970 * Basically, we've moved stuff around at the bottom of the tree and
1971 * we need to fix up the extent records above the changes to reflect
1972 * the new changes.
1973 *
1974 * left_rec: the record on the left.
1975 * left_child_el: is the child list pointed to by left_rec
1976 * right_rec: the record to the right of left_rec
1977 * right_child_el: is the child list pointed to by right_rec
1978 *
1979 * By definition, this only works on interior nodes.
1980 */
1981static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1982                  struct ocfs2_extent_list *left_child_el,
1983                  struct ocfs2_extent_rec *right_rec,
1984                  struct ocfs2_extent_list *right_child_el)
1985{
1986    u32 left_clusters, right_end;
1987
1988    /*
1989     * Interior nodes never have holes. Their cpos is the cpos of
1990     * the leftmost record in their child list. Their cluster
1991     * count covers the full theoretical range of their child list
1992     * - the range between their cpos and the cpos of the record
1993     * immediately to their right.
1994     */
1995    left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1996    if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1997        BUG_ON(right_child_el->l_tree_depth);
1998        BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1999        left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
2000    }
2001    left_clusters -= le32_to_cpu(left_rec->e_cpos);
2002    left_rec->e_int_clusters = cpu_to_le32(left_clusters);
2003
2004    /*
2005     * Calculate the rightmost cluster count boundary before
2006     * moving cpos - we will need to adjust clusters after
2007     * updating e_cpos to keep the same highest cluster count.
2008     */
2009    right_end = le32_to_cpu(right_rec->e_cpos);
2010    right_end += le32_to_cpu(right_rec->e_int_clusters);
2011
2012    right_rec->e_cpos = left_rec->e_cpos;
2013    le32_add_cpu(&right_rec->e_cpos, left_clusters);
2014
2015    right_end -= le32_to_cpu(right_rec->e_cpos);
2016    right_rec->e_int_clusters = cpu_to_le32(right_end);
2017}
2018
2019/*
2020 * Adjust the adjacent root node records involved in a
2021 * rotation. left_el_blkno is passed in as a key so that we can easily
2022 * find it's index in the root list.
2023 */
2024static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
2025                      struct ocfs2_extent_list *left_el,
2026                      struct ocfs2_extent_list *right_el,
2027                      u64 left_el_blkno)
2028{
2029    int i;
2030
2031    BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2032           le16_to_cpu(left_el->l_tree_depth));
2033
2034    for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2035        if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2036            break;
2037    }
2038
2039    /*
2040     * The path walking code should have never returned a root and
2041     * two paths which are not adjacent.
2042     */
2043    BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2044
2045    ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2046                      &root_el->l_recs[i + 1], right_el);
2047}
2048
2049/*
2050 * We've changed a leaf block (in right_path) and need to reflect that
2051 * change back up the subtree.
2052 *
2053 * This happens in multiple places:
2054 * - When we've moved an extent record from the left path leaf to the right
2055 * path leaf to make room for an empty extent in the left path leaf.
2056 * - When our insert into the right path leaf is at the leftmost edge
2057 * and requires an update of the path immediately to it's left. This
2058 * can occur at the end of some types of rotation and appending inserts.
2059 * - When we've adjusted the last extent record in the left path leaf and the
2060 * 1st extent record in the right path leaf during cross extent block merge.
2061 */
2062static void ocfs2_complete_edge_insert(handle_t *handle,
2063                       struct ocfs2_path *left_path,
2064                       struct ocfs2_path *right_path,
2065                       int subtree_index)
2066{
2067    int ret, i, idx;
2068    struct ocfs2_extent_list *el, *left_el, *right_el;
2069    struct ocfs2_extent_rec *left_rec, *right_rec;
2070    struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2071
2072    /*
2073     * Update the counts and position values within all the
2074     * interior nodes to reflect the leaf rotation we just did.
2075     *
2076     * The root node is handled below the loop.
2077     *
2078     * We begin the loop with right_el and left_el pointing to the
2079     * leaf lists and work our way up.
2080     *
2081     * NOTE: within this loop, left_el and right_el always refer
2082     * to the *child* lists.
2083     */
2084    left_el = path_leaf_el(left_path);
2085    right_el = path_leaf_el(right_path);
2086    for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2087        mlog(0, "Adjust records at index %u\n", i);
2088
2089        /*
2090         * One nice property of knowing that all of these
2091         * nodes are below the root is that we only deal with
2092         * the leftmost right node record and the rightmost
2093         * left node record.
2094         */
2095        el = left_path->p_node[i].el;
2096        idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2097        left_rec = &el->l_recs[idx];
2098
2099        el = right_path->p_node[i].el;
2100        right_rec = &el->l_recs[0];
2101
2102        ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2103                          right_el);
2104
2105        ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2106        if (ret)
2107            mlog_errno(ret);
2108
2109        ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2110        if (ret)
2111            mlog_errno(ret);
2112
2113        /*
2114         * Setup our list pointers now so that the current
2115         * parents become children in the next iteration.
2116         */
2117        left_el = left_path->p_node[i].el;
2118        right_el = right_path->p_node[i].el;
2119    }
2120
2121    /*
2122     * At the root node, adjust the two adjacent records which
2123     * begin our path to the leaves.
2124     */
2125
2126    el = left_path->p_node[subtree_index].el;
2127    left_el = left_path->p_node[subtree_index + 1].el;
2128    right_el = right_path->p_node[subtree_index + 1].el;
2129
2130    ocfs2_adjust_root_records(el, left_el, right_el,
2131                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2132
2133    root_bh = left_path->p_node[subtree_index].bh;
2134
2135    ret = ocfs2_journal_dirty(handle, root_bh);
2136    if (ret)
2137        mlog_errno(ret);
2138}
2139
2140static int ocfs2_rotate_subtree_right(handle_t *handle,
2141                      struct ocfs2_extent_tree *et,
2142                      struct ocfs2_path *left_path,
2143                      struct ocfs2_path *right_path,
2144                      int subtree_index)
2145{
2146    int ret, i;
2147    struct buffer_head *right_leaf_bh;
2148    struct buffer_head *left_leaf_bh = NULL;
2149    struct buffer_head *root_bh;
2150    struct ocfs2_extent_list *right_el, *left_el;
2151    struct ocfs2_extent_rec move_rec;
2152
2153    left_leaf_bh = path_leaf_bh(left_path);
2154    left_el = path_leaf_el(left_path);
2155
2156    if (left_el->l_next_free_rec != left_el->l_count) {
2157        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2158                "Inode %llu has non-full interior leaf node %llu"
2159                "(next free = %u)",
2160                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2161                (unsigned long long)left_leaf_bh->b_blocknr,
2162                le16_to_cpu(left_el->l_next_free_rec));
2163        return -EROFS;
2164    }
2165
2166    /*
2167     * This extent block may already have an empty record, so we
2168     * return early if so.
2169     */
2170    if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2171        return 0;
2172
2173    root_bh = left_path->p_node[subtree_index].bh;
2174    BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2175
2176    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2177                       subtree_index);
2178    if (ret) {
2179        mlog_errno(ret);
2180        goto out;
2181    }
2182
2183    for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2184        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2185                           right_path, i);
2186        if (ret) {
2187            mlog_errno(ret);
2188            goto out;
2189        }
2190
2191        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2192                           left_path, i);
2193        if (ret) {
2194            mlog_errno(ret);
2195            goto out;
2196        }
2197    }
2198
2199    right_leaf_bh = path_leaf_bh(right_path);
2200    right_el = path_leaf_el(right_path);
2201
2202    /* This is a code error, not a disk corruption. */
2203    mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2204            "because rightmost leaf block %llu is empty\n",
2205            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2206            (unsigned long long)right_leaf_bh->b_blocknr);
2207
2208    ocfs2_create_empty_extent(right_el);
2209
2210    ret = ocfs2_journal_dirty(handle, right_leaf_bh);
2211    if (ret) {
2212        mlog_errno(ret);
2213        goto out;
2214    }
2215
2216    /* Do the copy now. */
2217    i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2218    move_rec = left_el->l_recs[i];
2219    right_el->l_recs[0] = move_rec;
2220
2221    /*
2222     * Clear out the record we just copied and shift everything
2223     * over, leaving an empty extent in the left leaf.
2224     *
2225     * We temporarily subtract from next_free_rec so that the
2226     * shift will lose the tail record (which is now defunct).
2227     */
2228    le16_add_cpu(&left_el->l_next_free_rec, -1);
2229    ocfs2_shift_records_right(left_el);
2230    memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2231    le16_add_cpu(&left_el->l_next_free_rec, 1);
2232
2233    ret = ocfs2_journal_dirty(handle, left_leaf_bh);
2234    if (ret) {
2235        mlog_errno(ret);
2236        goto out;
2237    }
2238
2239    ocfs2_complete_edge_insert(handle, left_path, right_path,
2240                   subtree_index);
2241
2242out:
2243    return ret;
2244}
2245
2246/*
2247 * Given a full path, determine what cpos value would return us a path
2248 * containing the leaf immediately to the left of the current one.
2249 *
2250 * Will return zero if the path passed in is already the leftmost path.
2251 */
2252static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2253                     struct ocfs2_path *path, u32 *cpos)
2254{
2255    int i, j, ret = 0;
2256    u64 blkno;
2257    struct ocfs2_extent_list *el;
2258
2259    BUG_ON(path->p_tree_depth == 0);
2260
2261    *cpos = 0;
2262
2263    blkno = path_leaf_bh(path)->b_blocknr;
2264
2265    /* Start at the tree node just above the leaf and work our way up. */
2266    i = path->p_tree_depth - 1;
2267    while (i >= 0) {
2268        el = path->p_node[i].el;
2269
2270        /*
2271         * Find the extent record just before the one in our
2272         * path.
2273         */
2274        for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2275            if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2276                if (j == 0) {
2277                    if (i == 0) {
2278                        /*
2279                         * We've determined that the
2280                         * path specified is already
2281                         * the leftmost one - return a
2282                         * cpos of zero.
2283                         */
2284                        goto out;
2285                    }
2286                    /*
2287                     * The leftmost record points to our
2288                     * leaf - we need to travel up the
2289                     * tree one level.
2290                     */
2291                    goto next_node;
2292                }
2293
2294                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2295                *cpos = *cpos + ocfs2_rec_clusters(el,
2296                               &el->l_recs[j - 1]);
2297                *cpos = *cpos - 1;
2298                goto out;
2299            }
2300        }
2301
2302        /*
2303         * If we got here, we never found a valid node where
2304         * the tree indicated one should be.
2305         */
2306        ocfs2_error(sb,
2307                "Invalid extent tree at extent block %llu\n",
2308                (unsigned long long)blkno);
2309        ret = -EROFS;
2310        goto out;
2311
2312next_node:
2313        blkno = path->p_node[i].bh->b_blocknr;
2314        i--;
2315    }
2316
2317out:
2318    return ret;
2319}
2320
2321/*
2322 * Extend the transaction by enough credits to complete the rotation,
2323 * and still leave at least the original number of credits allocated
2324 * to this transaction.
2325 */
2326static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2327                       int op_credits,
2328                       struct ocfs2_path *path)
2329{
2330    int ret;
2331    int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2332
2333    if (handle->h_buffer_credits < credits) {
2334        ret = ocfs2_extend_trans(handle,
2335                     credits - handle->h_buffer_credits);
2336        if (ret)
2337            return ret;
2338
2339        if (unlikely(handle->h_buffer_credits < credits))
2340            return ocfs2_extend_trans(handle, credits);
2341    }
2342
2343    return 0;
2344}
2345
2346/*
2347 * Trap the case where we're inserting into the theoretical range past
2348 * the _actual_ left leaf range. Otherwise, we'll rotate a record
2349 * whose cpos is less than ours into the right leaf.
2350 *
2351 * It's only necessary to look at the rightmost record of the left
2352 * leaf because the logic that calls us should ensure that the
2353 * theoretical ranges in the path components above the leaves are
2354 * correct.
2355 */
2356static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2357                         u32 insert_cpos)
2358{
2359    struct ocfs2_extent_list *left_el;
2360    struct ocfs2_extent_rec *rec;
2361    int next_free;
2362
2363    left_el = path_leaf_el(left_path);
2364    next_free = le16_to_cpu(left_el->l_next_free_rec);
2365    rec = &left_el->l_recs[next_free - 1];
2366
2367    if (insert_cpos > le32_to_cpu(rec->e_cpos))
2368        return 1;
2369    return 0;
2370}
2371
2372static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2373{
2374    int next_free = le16_to_cpu(el->l_next_free_rec);
2375    unsigned int range;
2376    struct ocfs2_extent_rec *rec;
2377
2378    if (next_free == 0)
2379        return 0;
2380
2381    rec = &el->l_recs[0];
2382    if (ocfs2_is_empty_extent(rec)) {
2383        /* Empty list. */
2384        if (next_free == 1)
2385            return 0;
2386        rec = &el->l_recs[1];
2387    }
2388
2389    range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2390    if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2391        return 1;
2392    return 0;
2393}
2394
2395/*
2396 * Rotate all the records in a btree right one record, starting at insert_cpos.
2397 *
2398 * The path to the rightmost leaf should be passed in.
2399 *
2400 * The array is assumed to be large enough to hold an entire path (tree depth).
2401 *
2402 * Upon successful return from this function:
2403 *
2404 * - The 'right_path' array will contain a path to the leaf block
2405 * whose range contains e_cpos.
2406 * - That leaf block will have a single empty extent in list index 0.
2407 * - In the case that the rotation requires a post-insert update,
2408 * *ret_left_path will contain a valid path which can be passed to
2409 * ocfs2_insert_path().
2410 */
2411static int ocfs2_rotate_tree_right(handle_t *handle,
2412                   struct ocfs2_extent_tree *et,
2413                   enum ocfs2_split_type split,
2414                   u32 insert_cpos,
2415                   struct ocfs2_path *right_path,
2416                   struct ocfs2_path **ret_left_path)
2417{
2418    int ret, start, orig_credits = handle->h_buffer_credits;
2419    u32 cpos;
2420    struct ocfs2_path *left_path = NULL;
2421    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2422
2423    *ret_left_path = NULL;
2424
2425    left_path = ocfs2_new_path_from_path(right_path);
2426    if (!left_path) {
2427        ret = -ENOMEM;
2428        mlog_errno(ret);
2429        goto out;
2430    }
2431
2432    ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2433    if (ret) {
2434        mlog_errno(ret);
2435        goto out;
2436    }
2437
2438    mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2439
2440    /*
2441     * What we want to do here is:
2442     *
2443     * 1) Start with the rightmost path.
2444     *
2445     * 2) Determine a path to the leaf block directly to the left
2446     * of that leaf.
2447     *
2448     * 3) Determine the 'subtree root' - the lowest level tree node
2449     * which contains a path to both leaves.
2450     *
2451     * 4) Rotate the subtree.
2452     *
2453     * 5) Find the next subtree by considering the left path to be
2454     * the new right path.
2455     *
2456     * The check at the top of this while loop also accepts
2457     * insert_cpos == cpos because cpos is only a _theoretical_
2458     * value to get us the left path - insert_cpos might very well
2459     * be filling that hole.
2460     *
2461     * Stop at a cpos of '0' because we either started at the
2462     * leftmost branch (i.e., a tree with one branch and a
2463     * rotation inside of it), or we've gone as far as we can in
2464     * rotating subtrees.
2465     */
2466    while (cpos && insert_cpos <= cpos) {
2467        mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2468             insert_cpos, cpos);
2469
2470        ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2471        if (ret) {
2472            mlog_errno(ret);
2473            goto out;
2474        }
2475
2476        mlog_bug_on_msg(path_leaf_bh(left_path) ==
2477                path_leaf_bh(right_path),
2478                "Owner %llu: error during insert of %u "
2479                "(left path cpos %u) results in two identical "
2480                "paths ending at %llu\n",
2481                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2482                insert_cpos, cpos,
2483                (unsigned long long)
2484                path_leaf_bh(left_path)->b_blocknr);
2485
2486        if (split == SPLIT_NONE &&
2487            ocfs2_rotate_requires_path_adjustment(left_path,
2488                              insert_cpos)) {
2489
2490            /*
2491             * We've rotated the tree as much as we
2492             * should. The rest is up to
2493             * ocfs2_insert_path() to complete, after the
2494             * record insertion. We indicate this
2495             * situation by returning the left path.
2496             *
2497             * The reason we don't adjust the records here
2498             * before the record insert is that an error
2499             * later might break the rule where a parent
2500             * record e_cpos will reflect the actual
2501             * e_cpos of the 1st nonempty record of the
2502             * child list.
2503             */
2504            *ret_left_path = left_path;
2505            goto out_ret_path;
2506        }
2507
2508        start = ocfs2_find_subtree_root(et, left_path, right_path);
2509
2510        mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2511             start,
2512             (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2513             right_path->p_tree_depth);
2514
2515        ret = ocfs2_extend_rotate_transaction(handle, start,
2516                              orig_credits, right_path);
2517        if (ret) {
2518            mlog_errno(ret);
2519            goto out;
2520        }
2521
2522        ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2523                         right_path, start);
2524        if (ret) {
2525            mlog_errno(ret);
2526            goto out;
2527        }
2528
2529        if (split != SPLIT_NONE &&
2530            ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2531                        insert_cpos)) {
2532            /*
2533             * A rotate moves the rightmost left leaf
2534             * record over to the leftmost right leaf
2535             * slot. If we're doing an extent split
2536             * instead of a real insert, then we have to
2537             * check that the extent to be split wasn't
2538             * just moved over. If it was, then we can
2539             * exit here, passing left_path back -
2540             * ocfs2_split_extent() is smart enough to
2541             * search both leaves.
2542             */
2543            *ret_left_path = left_path;
2544            goto out_ret_path;
2545        }
2546
2547        /*
2548         * There is no need to re-read the next right path
2549         * as we know that it'll be our current left
2550         * path. Optimize by copying values instead.
2551         */
2552        ocfs2_mv_path(right_path, left_path);
2553
2554        ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2555        if (ret) {
2556            mlog_errno(ret);
2557            goto out;
2558        }
2559    }
2560
2561out:
2562    ocfs2_free_path(left_path);
2563
2564out_ret_path:
2565    return ret;
2566}
2567
2568static int ocfs2_update_edge_lengths(handle_t *handle,
2569                     struct ocfs2_extent_tree *et,
2570                     int subtree_index, struct ocfs2_path *path)
2571{
2572    int i, idx, ret;
2573    struct ocfs2_extent_rec *rec;
2574    struct ocfs2_extent_list *el;
2575    struct ocfs2_extent_block *eb;
2576    u32 range;
2577
2578    /*
2579     * In normal tree rotation process, we will never touch the
2580     * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2581     * doesn't reserve the credits for them either.
2582     *
2583     * But we do have a special case here which will update the rightmost
2584     * records for all the bh in the path.
2585     * So we have to allocate extra credits and access them.
2586     */
2587    ret = ocfs2_extend_trans(handle,
2588                 handle->h_buffer_credits + subtree_index);
2589    if (ret) {
2590        mlog_errno(ret);
2591        goto out;
2592    }
2593
2594    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2595    if (ret) {
2596        mlog_errno(ret);
2597        goto out;
2598    }
2599
2600    /* Path should always be rightmost. */
2601    eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2602    BUG_ON(eb->h_next_leaf_blk != 0ULL);
2603
2604    el = &eb->h_list;
2605    BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2606    idx = le16_to_cpu(el->l_next_free_rec) - 1;
2607    rec = &el->l_recs[idx];
2608    range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2609
2610    for (i = 0; i < path->p_tree_depth; i++) {
2611        el = path->p_node[i].el;
2612        idx = le16_to_cpu(el->l_next_free_rec) - 1;
2613        rec = &el->l_recs[idx];
2614
2615        rec->e_int_clusters = cpu_to_le32(range);
2616        le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2617
2618        ocfs2_journal_dirty(handle, path->p_node[i].bh);
2619    }
2620out:
2621    return ret;
2622}
2623
2624static void ocfs2_unlink_path(handle_t *handle,
2625                  struct ocfs2_extent_tree *et,
2626                  struct ocfs2_cached_dealloc_ctxt *dealloc,
2627                  struct ocfs2_path *path, int unlink_start)
2628{
2629    int ret, i;
2630    struct ocfs2_extent_block *eb;
2631    struct ocfs2_extent_list *el;
2632    struct buffer_head *bh;
2633
2634    for(i = unlink_start; i < path_num_items(path); i++) {
2635        bh = path->p_node[i].bh;
2636
2637        eb = (struct ocfs2_extent_block *)bh->b_data;
2638        /*
2639         * Not all nodes might have had their final count
2640         * decremented by the caller - handle this here.
2641         */
2642        el = &eb->h_list;
2643        if (le16_to_cpu(el->l_next_free_rec) > 1) {
2644            mlog(ML_ERROR,
2645                 "Inode %llu, attempted to remove extent block "
2646                 "%llu with %u records\n",
2647                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2648                 (unsigned long long)le64_to_cpu(eb->h_blkno),
2649                 le16_to_cpu(el->l_next_free_rec));
2650
2651            ocfs2_journal_dirty(handle, bh);
2652            ocfs2_remove_from_cache(et->et_ci, bh);
2653            continue;
2654        }
2655
2656        el->l_next_free_rec = 0;
2657        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2658
2659        ocfs2_journal_dirty(handle, bh);
2660
2661        ret = ocfs2_cache_extent_block_free(dealloc, eb);
2662        if (ret)
2663            mlog_errno(ret);
2664
2665        ocfs2_remove_from_cache(et->et_ci, bh);
2666    }
2667}
2668
2669static void ocfs2_unlink_subtree(handle_t *handle,
2670                 struct ocfs2_extent_tree *et,
2671                 struct ocfs2_path *left_path,
2672                 struct ocfs2_path *right_path,
2673                 int subtree_index,
2674                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2675{
2676    int i;
2677    struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2678    struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2679    struct ocfs2_extent_list *el;
2680    struct ocfs2_extent_block *eb;
2681
2682    el = path_leaf_el(left_path);
2683
2684    eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2685
2686    for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2687        if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2688            break;
2689
2690    BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2691
2692    memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2693    le16_add_cpu(&root_el->l_next_free_rec, -1);
2694
2695    eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2696    eb->h_next_leaf_blk = 0;
2697
2698    ocfs2_journal_dirty(handle, root_bh);
2699    ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2700
2701    ocfs2_unlink_path(handle, et, dealloc, right_path,
2702              subtree_index + 1);
2703}
2704
2705static int ocfs2_rotate_subtree_left(handle_t *handle,
2706                     struct ocfs2_extent_tree *et,
2707                     struct ocfs2_path *left_path,
2708                     struct ocfs2_path *right_path,
2709                     int subtree_index,
2710                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2711                     int *deleted)
2712{
2713    int ret, i, del_right_subtree = 0, right_has_empty = 0;
2714    struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2715    struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2716    struct ocfs2_extent_block *eb;
2717
2718    *deleted = 0;
2719
2720    right_leaf_el = path_leaf_el(right_path);
2721    left_leaf_el = path_leaf_el(left_path);
2722    root_bh = left_path->p_node[subtree_index].bh;
2723    BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2724
2725    if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2726        return 0;
2727
2728    eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2729    if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2730        /*
2731         * It's legal for us to proceed if the right leaf is
2732         * the rightmost one and it has an empty extent. There
2733         * are two cases to handle - whether the leaf will be
2734         * empty after removal or not. If the leaf isn't empty
2735         * then just remove the empty extent up front. The
2736         * next block will handle empty leaves by flagging
2737         * them for unlink.
2738         *
2739         * Non rightmost leaves will throw -EAGAIN and the
2740         * caller can manually move the subtree and retry.
2741         */
2742
2743        if (eb->h_next_leaf_blk != 0ULL)
2744            return -EAGAIN;
2745
2746        if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2747            ret = ocfs2_journal_access_eb(handle, et->et_ci,
2748                              path_leaf_bh(right_path),
2749                              OCFS2_JOURNAL_ACCESS_WRITE);
2750            if (ret) {
2751                mlog_errno(ret);
2752                goto out;
2753            }
2754
2755            ocfs2_remove_empty_extent(right_leaf_el);
2756        } else
2757            right_has_empty = 1;
2758    }
2759
2760    if (eb->h_next_leaf_blk == 0ULL &&
2761        le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2762        /*
2763         * We have to update i_last_eb_blk during the meta
2764         * data delete.
2765         */
2766        ret = ocfs2_et_root_journal_access(handle, et,
2767                           OCFS2_JOURNAL_ACCESS_WRITE);
2768        if (ret) {
2769            mlog_errno(ret);
2770            goto out;
2771        }
2772
2773        del_right_subtree = 1;
2774    }
2775
2776    /*
2777     * Getting here with an empty extent in the right path implies
2778     * that it's the rightmost path and will be deleted.
2779     */
2780    BUG_ON(right_has_empty && !del_right_subtree);
2781
2782    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2783                       subtree_index);
2784    if (ret) {
2785        mlog_errno(ret);
2786        goto out;
2787    }
2788
2789    for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2790        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2791                           right_path, i);
2792        if (ret) {
2793            mlog_errno(ret);
2794            goto out;
2795        }
2796
2797        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2798                           left_path, i);
2799        if (ret) {
2800            mlog_errno(ret);
2801            goto out;
2802        }
2803    }
2804
2805    if (!right_has_empty) {
2806        /*
2807         * Only do this if we're moving a real
2808         * record. Otherwise, the action is delayed until
2809         * after removal of the right path in which case we
2810         * can do a simple shift to remove the empty extent.
2811         */
2812        ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2813        memset(&right_leaf_el->l_recs[0], 0,
2814               sizeof(struct ocfs2_extent_rec));
2815    }
2816    if (eb->h_next_leaf_blk == 0ULL) {
2817        /*
2818         * Move recs over to get rid of empty extent, decrease
2819         * next_free. This is allowed to remove the last
2820         * extent in our leaf (setting l_next_free_rec to
2821         * zero) - the delete code below won't care.
2822         */
2823        ocfs2_remove_empty_extent(right_leaf_el);
2824    }
2825
2826    ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2827    if (ret)
2828        mlog_errno(ret);
2829    ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2830    if (ret)
2831        mlog_errno(ret);
2832
2833    if (del_right_subtree) {
2834        ocfs2_unlink_subtree(handle, et, left_path, right_path,
2835                     subtree_index, dealloc);
2836        ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2837                        left_path);
2838        if (ret) {
2839            mlog_errno(ret);
2840            goto out;
2841        }
2842
2843        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2844        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2845
2846        /*
2847         * Removal of the extent in the left leaf was skipped
2848         * above so we could delete the right path
2849         * 1st.
2850         */
2851        if (right_has_empty)
2852            ocfs2_remove_empty_extent(left_leaf_el);
2853
2854        ret = ocfs2_journal_dirty(handle, et_root_bh);
2855        if (ret)
2856            mlog_errno(ret);
2857
2858        *deleted = 1;
2859    } else
2860        ocfs2_complete_edge_insert(handle, left_path, right_path,
2861                       subtree_index);
2862
2863out:
2864    return ret;
2865}
2866
2867/*
2868 * Given a full path, determine what cpos value would return us a path
2869 * containing the leaf immediately to the right of the current one.
2870 *
2871 * Will return zero if the path passed in is already the rightmost path.
2872 *
2873 * This looks similar, but is subtly different to
2874 * ocfs2_find_cpos_for_left_leaf().
2875 */
2876int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2877                   struct ocfs2_path *path, u32 *cpos)
2878{
2879    int i, j, ret = 0;
2880    u64 blkno;
2881    struct ocfs2_extent_list *el;
2882
2883    *cpos = 0;
2884
2885    if (path->p_tree_depth == 0)
2886        return 0;
2887
2888    blkno = path_leaf_bh(path)->b_blocknr;
2889
2890    /* Start at the tree node just above the leaf and work our way up. */
2891    i = path->p_tree_depth - 1;
2892    while (i >= 0) {
2893        int next_free;
2894
2895        el = path->p_node[i].el;
2896
2897        /*
2898         * Find the extent record just after the one in our
2899         * path.
2900         */
2901        next_free = le16_to_cpu(el->l_next_free_rec);
2902        for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2903            if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2904                if (j == (next_free - 1)) {
2905                    if (i == 0) {
2906                        /*
2907                         * We've determined that the
2908                         * path specified is already
2909                         * the rightmost one - return a
2910                         * cpos of zero.
2911                         */
2912                        goto out;
2913                    }
2914                    /*
2915                     * The rightmost record points to our
2916                     * leaf - we need to travel up the
2917                     * tree one level.
2918                     */
2919                    goto next_node;
2920                }
2921
2922                *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2923                goto out;
2924            }
2925        }
2926
2927        /*
2928         * If we got here, we never found a valid node where
2929         * the tree indicated one should be.
2930         */
2931        ocfs2_error(sb,
2932                "Invalid extent tree at extent block %llu\n",
2933                (unsigned long long)blkno);
2934        ret = -EROFS;
2935        goto out;
2936
2937next_node:
2938        blkno = path->p_node[i].bh->b_blocknr;
2939        i--;
2940    }
2941
2942out:
2943    return ret;
2944}
2945
2946static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2947                        struct ocfs2_extent_tree *et,
2948                        struct ocfs2_path *path)
2949{
2950    int ret;
2951    struct buffer_head *bh = path_leaf_bh(path);
2952    struct ocfs2_extent_list *el = path_leaf_el(path);
2953
2954    if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2955        return 0;
2956
2957    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2958                       path_num_items(path) - 1);
2959    if (ret) {
2960        mlog_errno(ret);
2961        goto out;
2962    }
2963
2964    ocfs2_remove_empty_extent(el);
2965
2966    ret = ocfs2_journal_dirty(handle, bh);
2967    if (ret)
2968        mlog_errno(ret);
2969
2970out:
2971    return ret;
2972}
2973
2974static int __ocfs2_rotate_tree_left(handle_t *handle,
2975                    struct ocfs2_extent_tree *et,
2976                    int orig_credits,
2977                    struct ocfs2_path *path,
2978                    struct ocfs2_cached_dealloc_ctxt *dealloc,
2979                    struct ocfs2_path **empty_extent_path)
2980{
2981    int ret, subtree_root, deleted;
2982    u32 right_cpos;
2983    struct ocfs2_path *left_path = NULL;
2984    struct ocfs2_path *right_path = NULL;
2985    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2986
2987    BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2988
2989    *empty_extent_path = NULL;
2990
2991    ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2992    if (ret) {
2993        mlog_errno(ret);
2994        goto out;
2995    }
2996
2997    left_path = ocfs2_new_path_from_path(path);
2998    if (!left_path) {
2999        ret = -ENOMEM;
3000        mlog_errno(ret);
3001        goto out;
3002    }
3003
3004    ocfs2_cp_path(left_path, path);
3005
3006    right_path = ocfs2_new_path_from_path(path);
3007    if (!right_path) {
3008        ret = -ENOMEM;
3009        mlog_errno(ret);
3010        goto out;
3011    }
3012
3013    while (right_cpos) {
3014        ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3015        if (ret) {
3016            mlog_errno(ret);
3017            goto out;
3018        }
3019
3020        subtree_root = ocfs2_find_subtree_root(et, left_path,
3021                               right_path);
3022
3023        mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
3024             subtree_root,
3025             (unsigned long long)
3026             right_path->p_node[subtree_root].bh->b_blocknr,
3027             right_path->p_tree_depth);
3028
3029        ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
3030                              orig_credits, left_path);
3031        if (ret) {
3032            mlog_errno(ret);
3033            goto out;
3034        }
3035
3036        /*
3037         * Caller might still want to make changes to the
3038         * tree root, so re-add it to the journal here.
3039         */
3040        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3041                           left_path, 0);
3042        if (ret) {
3043            mlog_errno(ret);
3044            goto out;
3045        }
3046
3047        ret = ocfs2_rotate_subtree_left(handle, et, left_path,
3048                        right_path, subtree_root,
3049                        dealloc, &deleted);
3050        if (ret == -EAGAIN) {
3051            /*
3052             * The rotation has to temporarily stop due to
3053             * the right subtree having an empty
3054             * extent. Pass it back to the caller for a
3055             * fixup.
3056             */
3057            *empty_extent_path = right_path;
3058            right_path = NULL;
3059            goto out;
3060        }
3061        if (ret) {
3062            mlog_errno(ret);
3063            goto out;
3064        }
3065
3066        /*
3067         * The subtree rotate might have removed records on
3068         * the rightmost edge. If so, then rotation is
3069         * complete.
3070         */
3071        if (deleted)
3072            break;
3073
3074        ocfs2_mv_path(left_path, right_path);
3075
3076        ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3077                             &right_cpos);
3078        if (ret) {
3079            mlog_errno(ret);
3080            goto out;
3081        }
3082    }
3083
3084out:
3085    ocfs2_free_path(right_path);
3086    ocfs2_free_path(left_path);
3087
3088    return ret;
3089}
3090
3091static int ocfs2_remove_rightmost_path(handle_t *handle,
3092                struct ocfs2_extent_tree *et,
3093                struct ocfs2_path *path,
3094                struct ocfs2_cached_dealloc_ctxt *dealloc)
3095{
3096    int ret, subtree_index;
3097    u32 cpos;
3098    struct ocfs2_path *left_path = NULL;
3099    struct ocfs2_extent_block *eb;
3100    struct ocfs2_extent_list *el;
3101
3102
3103    ret = ocfs2_et_sanity_check(et);
3104    if (ret)
3105        goto out;
3106    /*
3107     * There's two ways we handle this depending on
3108     * whether path is the only existing one.
3109     */
3110    ret = ocfs2_extend_rotate_transaction(handle, 0,
3111                          handle->h_buffer_credits,
3112                          path);
3113    if (ret) {
3114        mlog_errno(ret);
3115        goto out;
3116    }
3117
3118    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3119    if (ret) {
3120        mlog_errno(ret);
3121        goto out;
3122    }
3123
3124    ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3125                        path, &cpos);
3126    if (ret) {
3127        mlog_errno(ret);
3128        goto out;
3129    }
3130
3131    if (cpos) {
3132        /*
3133         * We have a path to the left of this one - it needs
3134         * an update too.
3135         */
3136        left_path = ocfs2_new_path_from_path(path);
3137        if (!left_path) {
3138            ret = -ENOMEM;
3139            mlog_errno(ret);
3140            goto out;
3141        }
3142
3143        ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3144        if (ret) {
3145            mlog_errno(ret);
3146            goto out;
3147        }
3148
3149        ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3150        if (ret) {
3151            mlog_errno(ret);
3152            goto out;
3153        }
3154
3155        subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3156
3157        ocfs2_unlink_subtree(handle, et, left_path, path,
3158                     subtree_index, dealloc);
3159        ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3160                        left_path);
3161        if (ret) {
3162            mlog_errno(ret);
3163            goto out;
3164        }
3165
3166        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3167        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3168    } else {
3169        /*
3170         * 'path' is also the leftmost path which
3171         * means it must be the only one. This gets
3172         * handled differently because we want to
3173         * revert the root back to having extents
3174         * in-line.
3175         */
3176        ocfs2_unlink_path(handle, et, dealloc, path, 1);
3177
3178        el = et->et_root_el;
3179        el->l_tree_depth = 0;
3180        el->l_next_free_rec = 0;
3181        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3182
3183        ocfs2_et_set_last_eb_blk(et, 0);
3184    }
3185
3186    ocfs2_journal_dirty(handle, path_root_bh(path));
3187
3188out:
3189    ocfs2_free_path(left_path);
3190    return ret;
3191}
3192
3193/*
3194 * Left rotation of btree records.
3195 *
3196 * In many ways, this is (unsurprisingly) the opposite of right
3197 * rotation. We start at some non-rightmost path containing an empty
3198 * extent in the leaf block. The code works its way to the rightmost
3199 * path by rotating records to the left in every subtree.
3200 *
3201 * This is used by any code which reduces the number of extent records
3202 * in a leaf. After removal, an empty record should be placed in the
3203 * leftmost list position.
3204 *
3205 * This won't handle a length update of the rightmost path records if
3206 * the rightmost tree leaf record is removed so the caller is
3207 * responsible for detecting and correcting that.
3208 */
3209static int ocfs2_rotate_tree_left(handle_t *handle,
3210                  struct ocfs2_extent_tree *et,
3211                  struct ocfs2_path *path,
3212                  struct ocfs2_cached_dealloc_ctxt *dealloc)
3213{
3214    int ret, orig_credits = handle->h_buffer_credits;
3215    struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3216    struct ocfs2_extent_block *eb;
3217    struct ocfs2_extent_list *el;
3218
3219    el = path_leaf_el(path);
3220    if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3221        return 0;
3222
3223    if (path->p_tree_depth == 0) {
3224rightmost_no_delete:
3225        /*
3226         * Inline extents. This is trivially handled, so do
3227         * it up front.
3228         */
3229        ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3230        if (ret)
3231            mlog_errno(ret);
3232        goto out;
3233    }
3234
3235    /*
3236     * Handle rightmost branch now. There's several cases:
3237     * 1) simple rotation leaving records in there. That's trivial.
3238     * 2) rotation requiring a branch delete - there's no more
3239     * records left. Two cases of this:
3240     * a) There are branches to the left.
3241     * b) This is also the leftmost (the only) branch.
3242     *
3243     * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
3244     * 2a) we need the left branch so that we can update it with the unlink
3245     * 2b) we need to bring the root back to inline extents.
3246     */
3247
3248    eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3249    el = &eb->h_list;
3250    if (eb->h_next_leaf_blk == 0) {
3251        /*
3252         * This gets a bit tricky if we're going to delete the
3253         * rightmost path. Get the other cases out of the way
3254         * 1st.
3255         */
3256        if (le16_to_cpu(el->l_next_free_rec) > 1)
3257            goto rightmost_no_delete;
3258
3259        if (le16_to_cpu(el->l_next_free_rec) == 0) {
3260            ret = -EIO;
3261            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3262                    "Owner %llu has empty extent block at %llu",
3263                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3264                    (unsigned long long)le64_to_cpu(eb->h_blkno));
3265            goto out;
3266        }
3267
3268        /*
3269         * XXX: The caller can not trust "path" any more after
3270         * this as it will have been deleted. What do we do?
3271         *
3272         * In theory the rotate-for-merge code will never get
3273         * here because it'll always ask for a rotate in a
3274         * nonempty list.
3275         */
3276
3277        ret = ocfs2_remove_rightmost_path(handle, et, path,
3278                          dealloc);
3279        if (ret)
3280            mlog_errno(ret);
3281        goto out;
3282    }
3283
3284    /*
3285     * Now we can loop, remembering the path we get from -EAGAIN
3286     * and restarting from there.
3287     */
3288try_rotate:
3289    ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3290                       dealloc, &restart_path);
3291    if (ret && ret != -EAGAIN) {
3292        mlog_errno(ret);
3293        goto out;
3294    }
3295
3296    while (ret == -EAGAIN) {
3297        tmp_path = restart_path;
3298        restart_path = NULL;
3299
3300        ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3301                           tmp_path, dealloc,
3302                           &restart_path);
3303        if (ret && ret != -EAGAIN) {
3304            mlog_errno(ret);
3305            goto out;
3306        }
3307
3308        ocfs2_free_path(tmp_path);
3309        tmp_path = NULL;
3310
3311        if (ret == 0)
3312            goto try_rotate;
3313    }
3314
3315out:
3316    ocfs2_free_path(tmp_path);
3317    ocfs2_free_path(restart_path);
3318    return ret;
3319}
3320
3321static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3322                int index)
3323{
3324    struct ocfs2_extent_rec *rec = &el->l_recs[index];
3325    unsigned int size;
3326
3327    if (rec->e_leaf_clusters == 0) {
3328        /*
3329         * We consumed all of the merged-from record. An empty
3330         * extent cannot exist anywhere but the 1st array
3331         * position, so move things over if the merged-from
3332         * record doesn't occupy that position.
3333         *
3334         * This creates a new empty extent so the caller
3335         * should be smart enough to have removed any existing
3336         * ones.
3337         */
3338        if (index > 0) {
3339            BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3340            size = index * sizeof(struct ocfs2_extent_rec);
3341            memmove(&el->l_recs[1], &el->l_recs[0], size);
3342        }
3343
3344        /*
3345         * Always memset - the caller doesn't check whether it
3346         * created an empty extent, so there could be junk in
3347         * the other fields.
3348         */
3349        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3350    }
3351}
3352
3353static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3354                struct ocfs2_path *left_path,
3355                struct ocfs2_path **ret_right_path)
3356{
3357    int ret;
3358    u32 right_cpos;
3359    struct ocfs2_path *right_path = NULL;
3360    struct ocfs2_extent_list *left_el;
3361
3362    *ret_right_path = NULL;
3363
3364    /* This function shouldn't be called for non-trees. */
3365    BUG_ON(left_path->p_tree_depth == 0);
3366
3367    left_el = path_leaf_el(left_path);
3368    BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3369
3370    ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3371                         left_path, &right_cpos);
3372    if (ret) {
3373        mlog_errno(ret);
3374        goto out;
3375    }
3376
3377    /* This function shouldn't be called for the rightmost leaf. */
3378    BUG_ON(right_cpos == 0);
3379
3380    right_path = ocfs2_new_path_from_path(left_path);
3381    if (!right_path) {
3382        ret = -ENOMEM;
3383        mlog_errno(ret);
3384        goto out;
3385    }
3386
3387    ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3388    if (ret) {
3389        mlog_errno(ret);
3390        goto out;
3391    }
3392
3393    *ret_right_path = right_path;
3394out:
3395    if (ret)
3396        ocfs2_free_path(right_path);
3397    return ret;
3398}
3399
3400/*
3401 * Remove split_rec clusters from the record at index and merge them
3402 * onto the beginning of the record "next" to it.
3403 * For index < l_count - 1, the next means the extent rec at index + 1.
3404 * For index == l_count - 1, the "next" means the 1st extent rec of the
3405 * next extent block.
3406 */
3407static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3408                 handle_t *handle,
3409                 struct ocfs2_extent_tree *et,
3410                 struct ocfs2_extent_rec *split_rec,
3411                 int index)
3412{
3413    int ret, next_free, i;
3414    unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3415    struct ocfs2_extent_rec *left_rec;
3416    struct ocfs2_extent_rec *right_rec;
3417    struct ocfs2_extent_list *right_el;
3418    struct ocfs2_path *right_path = NULL;
3419    int subtree_index = 0;
3420    struct ocfs2_extent_list *el = path_leaf_el(left_path);
3421    struct buffer_head *bh = path_leaf_bh(left_path);
3422    struct buffer_head *root_bh = NULL;
3423
3424    BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3425    left_rec = &el->l_recs[index];
3426
3427    if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3428        le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3429        /* we meet with a cross extent block merge. */
3430        ret = ocfs2_get_right_path(et, left_path, &right_path);
3431        if (ret) {
3432            mlog_errno(ret);
3433            goto out;
3434        }
3435
3436        right_el = path_leaf_el(right_path);
3437        next_free = le16_to_cpu(right_el->l_next_free_rec);
3438        BUG_ON(next_free <= 0);
3439        right_rec = &right_el->l_recs[0];
3440        if (ocfs2_is_empty_extent(right_rec)) {
3441            BUG_ON(next_free <= 1);
3442            right_rec = &right_el->l_recs[1];
3443        }
3444
3445        BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3446               le16_to_cpu(left_rec->e_leaf_clusters) !=
3447               le32_to_cpu(right_rec->e_cpos));
3448
3449        subtree_index = ocfs2_find_subtree_root(et, left_path,
3450                            right_path);
3451
3452        ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3453                              handle->h_buffer_credits,
3454                              right_path);
3455        if (ret) {
3456            mlog_errno(ret);
3457            goto out;
3458        }
3459
3460        root_bh = left_path->p_node[subtree_index].bh;
3461        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3462
3463        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3464                           subtree_index);
3465        if (ret) {
3466            mlog_errno(ret);
3467            goto out;
3468        }
3469
3470        for (i = subtree_index + 1;
3471             i < path_num_items(right_path); i++) {
3472            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3473                               right_path, i);
3474            if (ret) {
3475                mlog_errno(ret);
3476                goto out;
3477            }
3478
3479            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3480                               left_path, i);
3481            if (ret) {
3482                mlog_errno(ret);
3483                goto out;
3484            }
3485        }
3486
3487    } else {
3488        BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3489        right_rec = &el->l_recs[index + 1];
3490    }
3491
3492    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3493                       path_num_items(left_path) - 1);
3494    if (ret) {
3495        mlog_errno(ret);
3496        goto out;
3497    }
3498
3499    le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3500
3501    le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3502    le64_add_cpu(&right_rec->e_blkno,
3503             -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3504                           split_clusters));
3505    le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3506
3507    ocfs2_cleanup_merge(el, index);
3508
3509    ret = ocfs2_journal_dirty(handle, bh);
3510    if (ret)
3511        mlog_errno(ret);
3512
3513    if (right_path) {
3514        ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3515        if (ret)
3516            mlog_errno(ret);
3517
3518        ocfs2_complete_edge_insert(handle, left_path, right_path,
3519                       subtree_index);
3520    }
3521out:
3522    if (right_path)
3523        ocfs2_free_path(right_path);
3524    return ret;
3525}
3526
3527static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3528                   struct ocfs2_path *right_path,
3529                   struct ocfs2_path **ret_left_path)
3530{
3531    int ret;
3532    u32 left_cpos;
3533    struct ocfs2_path *left_path = NULL;
3534
3535    *ret_left_path = NULL;
3536
3537    /* This function shouldn't be called for non-trees. */
3538    BUG_ON(right_path->p_tree_depth == 0);
3539
3540    ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3541                        right_path, &left_cpos);
3542    if (ret) {
3543        mlog_errno(ret);
3544        goto out;
3545    }
3546
3547    /* This function shouldn't be called for the leftmost leaf. */
3548    BUG_ON(left_cpos == 0);
3549
3550    left_path = ocfs2_new_path_from_path(right_path);
3551    if (!left_path) {
3552        ret = -ENOMEM;
3553        mlog_errno(ret);
3554        goto out;
3555    }
3556
3557    ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3558    if (ret) {
3559        mlog_errno(ret);
3560        goto out;
3561    }
3562
3563    *ret_left_path = left_path;
3564out:
3565    if (ret)
3566        ocfs2_free_path(left_path);
3567    return ret;
3568}
3569
3570/*
3571 * Remove split_rec clusters from the record at index and merge them
3572 * onto the tail of the record "before" it.
3573 * For index > 0, the "before" means the extent rec at index - 1.
3574 *
3575 * For index == 0, the "before" means the last record of the previous
3576 * extent block. And there is also a situation that we may need to
3577 * remove the rightmost leaf extent block in the right_path and change
3578 * the right path to indicate the new rightmost path.
3579 */
3580static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3581                handle_t *handle,
3582                struct ocfs2_extent_tree *et,
3583                struct ocfs2_extent_rec *split_rec,
3584                struct ocfs2_cached_dealloc_ctxt *dealloc,
3585                int index)
3586{
3587    int ret, i, subtree_index = 0, has_empty_extent = 0;
3588    unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3589    struct ocfs2_extent_rec *left_rec;
3590    struct ocfs2_extent_rec *right_rec;
3591    struct ocfs2_extent_list *el = path_leaf_el(right_path);
3592    struct buffer_head *bh = path_leaf_bh(right_path);
3593    struct buffer_head *root_bh = NULL;
3594    struct ocfs2_path *left_path = NULL;
3595    struct ocfs2_extent_list *left_el;
3596
3597    BUG_ON(index < 0);
3598
3599    right_rec = &el->l_recs[index];
3600    if (index == 0) {
3601        /* we meet with a cross extent block merge. */
3602        ret = ocfs2_get_left_path(et, right_path, &left_path);
3603        if (ret) {
3604            mlog_errno(ret);
3605            goto out;
3606        }
3607
3608        left_el = path_leaf_el(left_path);
3609        BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3610               le16_to_cpu(left_el->l_count));
3611
3612        left_rec = &left_el->l_recs[
3613                le16_to_cpu(left_el->l_next_free_rec) - 1];
3614        BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3615               le16_to_cpu(left_rec->e_leaf_clusters) !=
3616               le32_to_cpu(split_rec->e_cpos));
3617
3618        subtree_index = ocfs2_find_subtree_root(et, left_path,
3619                            right_path);
3620
3621        ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3622                              handle->h_buffer_credits,
3623                              left_path);
3624        if (ret) {
3625            mlog_errno(ret);
3626            goto out;
3627        }
3628
3629        root_bh = left_path->p_node[subtree_index].bh;
3630        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3631
3632        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3633                           subtree_index);
3634        if (ret) {
3635            mlog_errno(ret);
3636            goto out;
3637        }
3638
3639        for (i = subtree_index + 1;
3640             i < path_num_items(right_path); i++) {
3641            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3642                               right_path, i);
3643            if (ret) {
3644                mlog_errno(ret);
3645                goto out;
3646            }
3647
3648            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3649                               left_path, i);
3650            if (ret) {
3651                mlog_errno(ret);
3652                goto out;
3653            }
3654        }
3655    } else {
3656        left_rec = &el->l_recs[index - 1];
3657        if (ocfs2_is_empty_extent(&el->l_recs[0]))
3658            has_empty_extent = 1;
3659    }
3660
3661    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3662                       path_num_items(right_path) - 1);
3663    if (ret) {
3664        mlog_errno(ret);
3665        goto out;
3666    }
3667
3668    if (has_empty_extent && index == 1) {
3669        /*
3670         * The easy case - we can just plop the record right in.
3671         */
3672        *left_rec = *split_rec;
3673
3674        has_empty_extent = 0;
3675    } else
3676        le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3677
3678    le32_add_cpu(&right_rec->e_cpos, split_clusters);
3679    le64_add_cpu(&right_rec->e_blkno,
3680             ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3681                          split_clusters));
3682    le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3683
3684    ocfs2_cleanup_merge(el, index);
3685
3686    ret = ocfs2_journal_dirty(handle, bh);
3687    if (ret)
3688        mlog_errno(ret);
3689
3690    if (left_path) {
3691        ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3692        if (ret)
3693            mlog_errno(ret);
3694
3695        /*
3696         * In the situation that the right_rec is empty and the extent
3697         * block is empty also, ocfs2_complete_edge_insert can't handle
3698         * it and we need to delete the right extent block.
3699         */
3700        if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3701            le16_to_cpu(el->l_next_free_rec) == 1) {
3702
3703            ret = ocfs2_remove_rightmost_path(handle, et,
3704                              right_path,
3705                              dealloc);
3706            if (ret) {
3707                mlog_errno(ret);
3708                goto out;
3709            }
3710
3711            /* Now the rightmost extent block has been deleted.
3712             * So we use the new rightmost path.
3713             */
3714            ocfs2_mv_path(right_path, left_path);
3715            left_path = NULL;
3716        } else
3717            ocfs2_complete_edge_insert(handle, left_path,
3718                           right_path, subtree_index);
3719    }
3720out:
3721    if (left_path)
3722        ocfs2_free_path(left_path);
3723    return ret;
3724}
3725
3726static int ocfs2_try_to_merge_extent(handle_t *handle,
3727                     struct ocfs2_extent_tree *et,
3728                     struct ocfs2_path *path,
3729                     int split_index,
3730                     struct ocfs2_extent_rec *split_rec,
3731                     struct ocfs2_cached_dealloc_ctxt *dealloc,
3732                     struct ocfs2_merge_ctxt *ctxt)
3733{
3734    int ret = 0;
3735    struct ocfs2_extent_list *el = path_leaf_el(path);
3736    struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3737
3738    BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3739
3740    if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3741        /*
3742         * The merge code will need to create an empty
3743         * extent to take the place of the newly
3744         * emptied slot. Remove any pre-existing empty
3745         * extents - having more than one in a leaf is
3746         * illegal.
3747         */
3748        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3749        if (ret) {
3750            mlog_errno(ret);
3751            goto out;
3752        }
3753        split_index--;
3754        rec = &el->l_recs[split_index];
3755    }
3756
3757    if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3758        /*
3759         * Left-right contig implies this.
3760         */
3761        BUG_ON(!ctxt->c_split_covers_rec);
3762
3763        /*
3764         * Since the leftright insert always covers the entire
3765         * extent, this call will delete the insert record
3766         * entirely, resulting in an empty extent record added to
3767         * the extent block.
3768         *
3769         * Since the adding of an empty extent shifts
3770         * everything back to the right, there's no need to
3771         * update split_index here.
3772         *
3773         * When the split_index is zero, we need to merge it to the
3774         * prevoius extent block. It is more efficient and easier
3775         * if we do merge_right first and merge_left later.
3776         */
3777        ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3778                        split_index);
3779        if (ret) {
3780            mlog_errno(ret);
3781            goto out;
3782        }
3783
3784        /*
3785         * We can only get this from logic error above.
3786         */
3787        BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3788
3789        /* The merge left us with an empty extent, remove it. */
3790        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3791        if (ret) {
3792            mlog_errno(ret);
3793            goto out;
3794        }
3795
3796        rec = &el->l_recs[split_index];
3797
3798        /*
3799         * Note that we don't pass split_rec here on purpose -
3800         * we've merged it into the rec already.
3801         */
3802        ret = ocfs2_merge_rec_left(path, handle, et, rec,
3803                       dealloc, split_index);
3804
3805        if (ret) {
3806            mlog_errno(ret);
3807            goto out;
3808        }
3809
3810        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3811        /*
3812         * Error from this last rotate is not critical, so
3813         * print but don't bubble it up.
3814         */
3815        if (ret)
3816            mlog_errno(ret);
3817        ret = 0;
3818    } else {
3819        /*
3820         * Merge a record to the left or right.
3821         *
3822         * 'contig_type' is relative to the existing record,
3823         * so for example, if we're "right contig", it's to
3824         * the record on the left (hence the left merge).
3825         */
3826        if (ctxt->c_contig_type == CONTIG_RIGHT) {
3827            ret = ocfs2_merge_rec_left(path, handle, et,
3828                           split_rec, dealloc,
3829                           split_index);
3830            if (ret) {
3831                mlog_errno(ret);
3832                goto out;
3833            }
3834        } else {
3835            ret = ocfs2_merge_rec_right(path, handle,
3836                            et, split_rec,
3837                            split_index);
3838            if (ret) {
3839                mlog_errno(ret);
3840                goto out;
3841            }
3842        }
3843
3844        if (ctxt->c_split_covers_rec) {
3845            /*
3846             * The merge may have left an empty extent in
3847             * our leaf. Try to rotate it away.
3848             */
3849            ret = ocfs2_rotate_tree_left(handle, et, path,
3850                             dealloc);
3851            if (ret)
3852                mlog_errno(ret);
3853            ret = 0;
3854        }
3855    }
3856
3857out:
3858    return ret;
3859}
3860
3861static void ocfs2_subtract_from_rec(struct super_block *sb,
3862                    enum ocfs2_split_type split,
3863                    struct ocfs2_extent_rec *rec,
3864                    struct ocfs2_extent_rec *split_rec)
3865{
3866    u64 len_blocks;
3867
3868    len_blocks = ocfs2_clusters_to_blocks(sb,
3869                le16_to_cpu(split_rec->e_leaf_clusters));
3870
3871    if (split == SPLIT_LEFT) {
3872        /*
3873         * Region is on the left edge of the existing
3874         * record.
3875         */
3876        le32_add_cpu(&rec->e_cpos,
3877                 le16_to_cpu(split_rec->e_leaf_clusters));
3878        le64_add_cpu(&rec->e_blkno, len_blocks);
3879        le16_add_cpu(&rec->e_leaf_clusters,
3880                 -le16_to_cpu(split_rec->e_leaf_clusters));
3881    } else {
3882        /*
3883         * Region is on the right edge of the existing
3884         * record.
3885         */
3886        le16_add_cpu(&rec->e_leaf_clusters,
3887                 -le16_to_cpu(split_rec->e_leaf_clusters));
3888    }
3889}
3890
3891/*
3892 * Do the final bits of extent record insertion at the target leaf
3893 * list. If this leaf is part of an allocation tree, it is assumed
3894 * that the tree above has been prepared.
3895 */
3896static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3897                 struct ocfs2_extent_rec *insert_rec,
3898                 struct ocfs2_extent_list *el,
3899                 struct ocfs2_insert_type *insert)
3900{
3901    int i = insert->ins_contig_index;
3902    unsigned int range;
3903    struct ocfs2_extent_rec *rec;
3904
3905    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3906
3907    if (insert->ins_split != SPLIT_NONE) {
3908        i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3909        BUG_ON(i == -1);
3910        rec = &el->l_recs[i];
3911        ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3912                    insert->ins_split, rec,
3913                    insert_rec);
3914        goto rotate;
3915    }
3916
3917    /*
3918     * Contiguous insert - either left or right.
3919     */
3920    if (insert->ins_contig != CONTIG_NONE) {
3921        rec = &el->l_recs[i];
3922        if (insert->ins_contig == CONTIG_LEFT) {
3923            rec->e_blkno = insert_rec->e_blkno;
3924            rec->e_cpos = insert_rec->e_cpos;
3925        }
3926        le16_add_cpu(&rec->e_leaf_clusters,
3927                 le16_to_cpu(insert_rec->e_leaf_clusters));
3928        return;
3929    }
3930
3931    /*
3932     * Handle insert into an empty leaf.
3933     */
3934    if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3935        ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3936         ocfs2_is_empty_extent(&el->l_recs[0]))) {
3937        el->l_recs[0] = *insert_rec;
3938        el->l_next_free_rec = cpu_to_le16(1);
3939        return;
3940    }
3941
3942    /*
3943     * Appending insert.
3944     */
3945    if (insert->ins_appending == APPEND_TAIL) {
3946        i = le16_to_cpu(el->l_next_free_rec) - 1;
3947        rec = &el->l_recs[i];
3948        range = le32_to_cpu(rec->e_cpos)
3949            + le16_to_cpu(rec->e_leaf_clusters);
3950        BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3951
3952        mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3953                le16_to_cpu(el->l_count),
3954                "owner %llu, depth %u, count %u, next free %u, "
3955                "rec.cpos %u, rec.clusters %u, "
3956                "insert.cpos %u, insert.clusters %u\n",
3957                ocfs2_metadata_cache_owner(et->et_ci),
3958                le16_to_cpu(el->l_tree_depth),
3959                le16_to_cpu(el->l_count),
3960                le16_to_cpu(el->l_next_free_rec),
3961                le32_to_cpu(el->l_recs[i].e_cpos),
3962                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3963                le32_to_cpu(insert_rec->e_cpos),
3964                le16_to_cpu(insert_rec->e_leaf_clusters));
3965        i++;
3966        el->l_recs[i] = *insert_rec;
3967        le16_add_cpu(&el->l_next_free_rec, 1);
3968        return;
3969    }
3970
3971rotate:
3972    /*
3973     * Ok, we have to rotate.
3974     *
3975     * At this point, it is safe to assume that inserting into an
3976     * empty leaf and appending to a leaf have both been handled
3977     * above.
3978     *
3979     * This leaf needs to have space, either by the empty 1st
3980     * extent record, or by virtue of an l_next_rec < l_count.
3981     */
3982    ocfs2_rotate_leaf(el, insert_rec);
3983}
3984
3985static void ocfs2_adjust_rightmost_records(handle_t *handle,
3986                       struct ocfs2_extent_tree *et,
3987                       struct ocfs2_path *path,
3988                       struct ocfs2_extent_rec *insert_rec)
3989{
3990    int ret, i, next_free;
3991    struct buffer_head *bh;
3992    struct ocfs2_extent_list *el;
3993    struct ocfs2_extent_rec *rec;
3994
3995    /*
3996     * Update everything except the leaf block.
3997     */
3998    for (i = 0; i < path->p_tree_depth; i++) {
3999        bh = path->p_node[i].bh;
4000        el = path->p_node[i].el;
4001
4002        next_free = le16_to_cpu(el->l_next_free_rec);
4003        if (next_free == 0) {
4004            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4005                    "Owner %llu has a bad extent list",
4006                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4007            ret = -EIO;
4008            return;
4009        }
4010
4011        rec = &el->l_recs[next_free - 1];
4012
4013        rec->e_int_clusters = insert_rec->e_cpos;
4014        le32_add_cpu(&rec->e_int_clusters,
4015                 le16_to_cpu(insert_rec->e_leaf_clusters));
4016        le32_add_cpu(&rec->e_int_clusters,
4017                 -le32_to_cpu(rec->e_cpos));
4018
4019        ret = ocfs2_journal_dirty(handle, bh);
4020        if (ret)
4021            mlog_errno(ret);
4022
4023    }
4024}
4025
4026static int ocfs2_append_rec_to_path(handle_t *handle,
4027                    struct ocfs2_extent_tree *et,
4028                    struct ocfs2_extent_rec *insert_rec,
4029                    struct ocfs2_path *right_path,
4030                    struct ocfs2_path **ret_left_path)
4031{
4032    int ret, next_free;
4033    struct ocfs2_extent_list *el;
4034    struct ocfs2_path *left_path = NULL;
4035
4036    *ret_left_path = NULL;
4037
4038    /*
4039     * This shouldn't happen for non-trees. The extent rec cluster
4040     * count manipulation below only works for interior nodes.
4041     */
4042    BUG_ON(right_path->p_tree_depth == 0);
4043
4044    /*
4045     * If our appending insert is at the leftmost edge of a leaf,
4046     * then we might need to update the rightmost records of the
4047     * neighboring path.
4048     */
4049    el = path_leaf_el(right_path);
4050    next_free = le16_to_cpu(el->l_next_free_rec);
4051    if (next_free == 0 ||
4052        (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
4053        u32 left_cpos;
4054
4055        ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
4056                            right_path, &left_cpos);
4057        if (ret) {
4058            mlog_errno(ret);
4059            goto out;
4060        }
4061
4062        mlog(0, "Append may need a left path update. cpos: %u, "
4063             "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
4064             left_cpos);
4065
4066        /*
4067         * No need to worry if the append is already in the
4068         * leftmost leaf.
4069         */
4070        if (left_cpos) {
4071            left_path = ocfs2_new_path_from_path(right_path);
4072            if (!left_path) {
4073                ret = -ENOMEM;
4074                mlog_errno(ret);
4075                goto out;
4076            }
4077
4078            ret = ocfs2_find_path(et->et_ci, left_path,
4079                          left_cpos);
4080            if (ret) {
4081                mlog_errno(ret);
4082                goto out;
4083            }
4084
4085            /*
4086             * ocfs2_insert_path() will pass the left_path to the
4087             * journal for us.
4088             */
4089        }
4090    }
4091
4092    ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4093    if (ret) {
4094        mlog_errno(ret);
4095        goto out;
4096    }
4097
4098    ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4099
4100    *ret_left_path = left_path;
4101    ret = 0;
4102out:
4103    if (ret != 0)
4104        ocfs2_free_path(left_path);
4105
4106    return ret;
4107}
4108
4109static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4110                   struct ocfs2_path *left_path,
4111                   struct ocfs2_path *right_path,
4112                   struct ocfs2_extent_rec *split_rec,
4113                   enum ocfs2_split_type split)
4114{
4115    int index;
4116    u32 cpos = le32_to_cpu(split_rec->e_cpos);
4117    struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4118    struct ocfs2_extent_rec *rec, *tmprec;
4119
4120    right_el = path_leaf_el(right_path);
4121    if (left_path)
4122        left_el = path_leaf_el(left_path);
4123
4124    el = right_el;
4125    insert_el = right_el;
4126    index = ocfs2_search_extent_list(el, cpos);
4127    if (index != -1) {
4128        if (index == 0 && left_path) {
4129            BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4130
4131            /*
4132             * This typically means that the record
4133             * started in the left path but moved to the
4134             * right as a result of rotation. We either
4135             * move the existing record to the left, or we
4136             * do the later insert there.
4137             *
4138             * In this case, the left path should always
4139             * exist as the rotate code will have passed
4140             * it back for a post-insert update.
4141             */
4142
4143            if (split == SPLIT_LEFT) {
4144                /*
4145                 * It's a left split. Since we know
4146                 * that the rotate code gave us an
4147                 * empty extent in the left path, we
4148                 * can just do the insert there.
4149                 */
4150                insert_el = left_el;
4151            } else {
4152                /*
4153                 * Right split - we have to move the
4154                 * existing record over to the left
4155                 * leaf. The insert will be into the
4156                 * newly created empty extent in the
4157                 * right leaf.
4158                 */
4159                tmprec = &right_el->l_recs[index];
4160                ocfs2_rotate_leaf(left_el, tmprec);
4161                el = left_el;
4162
4163                memset(tmprec, 0, sizeof(*tmprec));
4164                index = ocfs2_search_extent_list(left_el, cpos);
4165                BUG_ON(index == -1);
4166            }
4167        }
4168    } else {
4169        BUG_ON(!left_path);
4170        BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4171        /*
4172         * Left path is easy - we can just allow the insert to
4173         * happen.
4174         */
4175        el = left_el;
4176        insert_el = left_el;
4177        index = ocfs2_search_extent_list(el, cpos);
4178        BUG_ON(index == -1);
4179    }
4180
4181    rec = &el->l_recs[index];
4182    ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4183                split, rec, split_rec);
4184    ocfs2_rotate_leaf(insert_el, split_rec);
4185}
4186
4187/*
4188 * This function only does inserts on an allocation b-tree. For tree
4189 * depth = 0, ocfs2_insert_at_leaf() is called directly.
4190 *
4191 * right_path is the path we want to do the actual insert
4192 * in. left_path should only be passed in if we need to update that
4193 * portion of the tree after an edge insert.
4194 */
4195static int ocfs2_insert_path(handle_t *handle,
4196                 struct ocfs2_extent_tree *et,
4197                 struct ocfs2_path *left_path,
4198                 struct ocfs2_path *right_path,
4199                 struct ocfs2_extent_rec *insert_rec,
4200                 struct ocfs2_insert_type *insert)
4201{
4202    int ret, subtree_index;
4203    struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4204
4205    if (left_path) {
4206        int credits = handle->h_buffer_credits;
4207
4208        /*
4209         * There's a chance that left_path got passed back to
4210         * us without being accounted for in the
4211         * journal. Extend our transaction here to be sure we
4212         * can change those blocks.
4213         */
4214        credits += left_path->p_tree_depth;
4215
4216        ret = ocfs2_extend_trans(handle, credits);
4217        if (ret < 0) {
4218            mlog_errno(ret);
4219            goto out;
4220        }
4221
4222        ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4223        if (ret < 0) {
4224            mlog_errno(ret);
4225            goto out;
4226        }
4227    }
4228
4229    /*
4230     * Pass both paths to the journal. The majority of inserts
4231     * will be touching all components anyway.
4232     */
4233    ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4234    if (ret < 0) {
4235        mlog_errno(ret);
4236        goto out;
4237    }
4238
4239    if (insert->ins_split != SPLIT_NONE) {
4240        /*
4241         * We could call ocfs2_insert_at_leaf() for some types
4242         * of splits, but it's easier to just let one separate
4243         * function sort it all out.
4244         */
4245        ocfs2_split_record(et, left_path, right_path,
4246                   insert_rec, insert->ins_split);
4247
4248        /*
4249         * Split might have modified either leaf and we don't
4250         * have a guarantee that the later edge insert will
4251         * dirty this for us.
4252         */
4253        if (left_path)
4254            ret = ocfs2_journal_dirty(handle,
4255                          path_leaf_bh(left_path));
4256            if (ret)
4257                mlog_errno(ret);
4258    } else
4259        ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4260                     insert);
4261
4262    ret = ocfs2_journal_dirty(handle, leaf_bh);
4263    if (ret)
4264        mlog_errno(ret);
4265
4266    if (left_path) {
4267        /*
4268         * The rotate code has indicated that we need to fix
4269         * up portions of the tree after the insert.
4270         *
4271         * XXX: Should we extend the transaction here?
4272         */
4273        subtree_index = ocfs2_find_subtree_root(et, left_path,
4274                            right_path);
4275        ocfs2_complete_edge_insert(handle, left_path, right_path,
4276                       subtree_index);
4277    }
4278
4279    ret = 0;
4280out:
4281    return ret;
4282}
4283
4284static int ocfs2_do_insert_extent(handle_t *handle,
4285                  struct ocfs2_extent_tree *et,
4286                  struct ocfs2_extent_rec *insert_rec,
4287                  struct ocfs2_insert_type *type)
4288{
4289    int ret, rotate = 0;
4290    u32 cpos;
4291    struct ocfs2_path *right_path = NULL;
4292    struct ocfs2_path *left_path = NULL;
4293    struct ocfs2_extent_list *el;
4294
4295    el = et->et_root_el;
4296
4297    ret = ocfs2_et_root_journal_access(handle, et,
4298                       OCFS2_JOURNAL_ACCESS_WRITE);
4299    if (ret) {
4300        mlog_errno(ret);
4301        goto out;
4302    }
4303
4304    if (le16_to_cpu(el->l_tree_depth) == 0) {
4305        ocfs2_insert_at_leaf(et, insert_rec, el, type);
4306        goto out_update_clusters;
4307    }
4308
4309    right_path = ocfs2_new_path_from_et(et);
4310    if (!right_path) {
4311        ret = -ENOMEM;
4312        mlog_errno(ret);
4313        goto out;
4314    }
4315
4316    /*
4317     * Determine the path to start with. Rotations need the
4318     * rightmost path, everything else can go directly to the
4319     * target leaf.
4320     */
4321    cpos = le32_to_cpu(insert_rec->e_cpos);
4322    if (type->ins_appending == APPEND_NONE &&
4323        type->ins_contig == CONTIG_NONE) {
4324        rotate = 1;
4325        cpos = UINT_MAX;
4326    }
4327
4328    ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4329    if (ret) {
4330        mlog_errno(ret);
4331        goto out;
4332    }
4333
4334    /*
4335     * Rotations and appends need special treatment - they modify
4336     * parts of the tree's above them.
4337     *
4338     * Both might pass back a path immediate to the left of the
4339     * one being inserted to. This will be cause
4340     * ocfs2_insert_path() to modify the rightmost records of
4341     * left_path to account for an edge insert.
4342     *
4343     * XXX: When modifying this code, keep in mind that an insert
4344     * can wind up skipping both of these two special cases...
4345     */
4346    if (rotate) {
4347        ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4348                          le32_to_cpu(insert_rec->e_cpos),
4349                          right_path, &left_path);
4350        if (ret) {
4351            mlog_errno(ret);
4352            goto out;
4353        }
4354
4355        /*
4356         * ocfs2_rotate_tree_right() might have extended the
4357         * transaction without re-journaling our tree root.
4358         */
4359        ret = ocfs2_et_root_journal_access(handle, et,
4360                           OCFS2_JOURNAL_ACCESS_WRITE);
4361        if (ret) {
4362            mlog_errno(ret);
4363            goto out;
4364        }
4365    } else if (type->ins_appending == APPEND_TAIL
4366           && type->ins_contig != CONTIG_LEFT) {
4367        ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4368                           right_path, &left_path);
4369        if (ret) {
4370            mlog_errno(ret);
4371            goto out;
4372        }
4373    }
4374
4375    ret = ocfs2_insert_path(handle, et, left_path, right_path,
4376                insert_rec, type);
4377    if (ret) {
4378        mlog_errno(ret);
4379        goto out;
4380    }
4381
4382out_update_clusters:
4383    if (type->ins_split == SPLIT_NONE)
4384        ocfs2_et_update_clusters(et,
4385                     le16_to_cpu(insert_rec->e_leaf_clusters));
4386
4387    ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4388    if (ret)
4389        mlog_errno(ret);
4390
4391out:
4392    ocfs2_free_path(left_path);
4393    ocfs2_free_path(right_path);
4394
4395    return ret;
4396}
4397
4398static enum ocfs2_contig_type
4399ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4400                   struct ocfs2_path *path,
4401                   struct ocfs2_extent_list *el, int index,
4402                   struct ocfs2_extent_rec *split_rec)
4403{
4404    int status;
4405    enum ocfs2_contig_type ret = CONTIG_NONE;
4406    u32 left_cpos, right_cpos;
4407    struct ocfs2_extent_rec *rec = NULL;
4408    struct ocfs2_extent_list *new_el;
4409    struct ocfs2_path *left_path = NULL, *right_path = NULL;
4410    struct buffer_head *bh;
4411    struct ocfs2_extent_block *eb;
4412    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4413
4414    if (index > 0) {
4415        rec = &el->l_recs[index - 1];
4416    } else if (path->p_tree_depth > 0) {
4417        status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4418        if (status)
4419            goto out;
4420
4421        if (left_cpos != 0) {
4422            left_path = ocfs2_new_path_from_path(path);
4423            if (!left_path)
4424                goto out;
4425
4426            status = ocfs2_find_path(et->et_ci, left_path,
4427                         left_cpos);
4428            if (status)
4429                goto out;
4430
4431            new_el = path_leaf_el(left_path);
4432
4433            if (le16_to_cpu(new_el->l_next_free_rec) !=
4434                le16_to_cpu(new_el->l_count)) {
4435                bh = path_leaf_bh(left_path);
4436                eb = (struct ocfs2_extent_block *)bh->b_data;
4437                ocfs2_error(sb,
4438                        "Extent block #%llu has an "
4439                        "invalid l_next_free_rec of "
4440                        "%d. It should have "
4441                        "matched the l_count of %d",
4442                        (unsigned long long)le64_to_cpu(eb->h_blkno),
4443                        le16_to_cpu(new_el->l_next_free_rec),
4444                        le16_to_cpu(new_el->l_count));
4445                status = -EINVAL;
4446                goto out;
4447            }
4448            rec = &new_el->l_recs[
4449                le16_to_cpu(new_el->l_next_free_rec) - 1];
4450        }
4451    }
4452
4453    /*
4454     * We're careful to check for an empty extent record here -
4455     * the merge code will know what to do if it sees one.
4456     */
4457    if (rec) {
4458        if (index == 1 && ocfs2_is_empty_extent(rec)) {
4459            if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4460                ret = CONTIG_RIGHT;
4461        } else {
4462            ret = ocfs2_et_extent_contig(et, rec, split_rec);
4463        }
4464    }
4465
4466    rec = NULL;
4467    if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4468        rec = &el->l_recs[index + 1];
4469    else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4470         path->p_tree_depth > 0) {
4471        status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4472        if (status)
4473            goto out;
4474
4475        if (right_cpos == 0)
4476            goto out;
4477
4478        right_path = ocfs2_new_path_from_path(path);
4479        if (!right_path)
4480            goto out;
4481
4482        status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4483        if (status)
4484            goto out;
4485
4486        new_el = path_leaf_el(right_path);
4487        rec = &new_el->l_recs[0];
4488        if (ocfs2_is_empty_extent(rec)) {
4489            if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4490                bh = path_leaf_bh(right_path);
4491                eb = (struct ocfs2_extent_block *)bh->b_data;
4492                ocfs2_error(sb,
4493                        "Extent block #%llu has an "
4494                        "invalid l_next_free_rec of %d",
4495                        (unsigned long long)le64_to_cpu(eb->h_blkno),
4496                        le16_to_cpu(new_el->l_next_free_rec));
4497                status = -EINVAL;
4498                goto out;
4499            }
4500            rec = &new_el->l_recs[1];
4501        }
4502    }
4503
4504    if (rec) {
4505        enum ocfs2_contig_type contig_type;
4506
4507        contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4508
4509        if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4510            ret = CONTIG_LEFTRIGHT;
4511        else if (ret == CONTIG_NONE)
4512            ret = contig_type;
4513    }
4514
4515out:
4516    if (left_path)
4517        ocfs2_free_path(left_path);
4518    if (right_path)
4519        ocfs2_free_path(right_path);
4520
4521    return ret;
4522}
4523
4524static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4525                     struct ocfs2_insert_type *insert,
4526                     struct ocfs2_extent_list *el,
4527                     struct ocfs2_extent_rec *insert_rec)
4528{
4529    int i;
4530    enum ocfs2_contig_type contig_type = CONTIG_NONE;
4531
4532    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4533
4534    for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4535        contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4536                             insert_rec);
4537        if (contig_type != CONTIG_NONE) {
4538            insert->ins_contig_index = i;
4539            break;
4540        }
4541    }
4542    insert->ins_contig = contig_type;
4543
4544    if (insert->ins_contig != CONTIG_NONE) {
4545        struct ocfs2_extent_rec *rec =
4546                &el->l_recs[insert->ins_contig_index];
4547        unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4548                   le16_to_cpu(insert_rec->e_leaf_clusters);
4549
4550        /*
4551         * Caller might want us to limit the size of extents, don't
4552         * calculate contiguousness if we might exceed that limit.
4553         */
4554        if (et->et_max_leaf_clusters &&
4555            (len > et->et_max_leaf_clusters))
4556            insert->ins_contig = CONTIG_NONE;
4557    }
4558}
4559
4560/*
4561 * This should only be called against the righmost leaf extent list.
4562 *
4563 * ocfs2_figure_appending_type() will figure out whether we'll have to
4564 * insert at the tail of the rightmost leaf.
4565 *
4566 * This should also work against the root extent list for tree's with 0
4567 * depth. If we consider the root extent list to be the rightmost leaf node
4568 * then the logic here makes sense.
4569 */
4570static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4571                    struct ocfs2_extent_list *el,
4572                    struct ocfs2_extent_rec *insert_rec)
4573{
4574    int i;
4575    u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4576    struct ocfs2_extent_rec *rec;
4577
4578    insert->ins_appending = APPEND_NONE;
4579
4580    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4581
4582    if (!el->l_next_free_rec)
4583        goto set_tail_append;
4584
4585    if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4586        /* Were all records empty? */
4587        if (le16_to_cpu(el->l_next_free_rec) == 1)
4588            goto set_tail_append;
4589    }
4590
4591    i = le16_to_cpu(el->l_next_free_rec) - 1;
4592    rec = &el->l_recs[i];
4593
4594    if (cpos >=
4595        (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4596        goto set_tail_append;
4597
4598    return;
4599
4600set_tail_append:
4601    insert->ins_appending = APPEND_TAIL;
4602}
4603
4604/*
4605 * Helper function called at the begining of an insert.
4606 *
4607 * This computes a few things that are commonly used in the process of
4608 * inserting into the btree:
4609 * - Whether the new extent is contiguous with an existing one.
4610 * - The current tree depth.
4611 * - Whether the insert is an appending one.
4612 * - The total # of free records in the tree.
4613 *
4614 * All of the information is stored on the ocfs2_insert_type
4615 * structure.
4616 */
4617static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4618                    struct buffer_head **last_eb_bh,
4619                    struct ocfs2_extent_rec *insert_rec,
4620                    int *free_records,
4621                    struct ocfs2_insert_type *insert)
4622{
4623    int ret;
4624    struct ocfs2_extent_block *eb;
4625    struct ocfs2_extent_list *el;
4626    struct ocfs2_path *path = NULL;
4627    struct buffer_head *bh = NULL;
4628
4629    insert->ins_split = SPLIT_NONE;
4630
4631    el = et->et_root_el;
4632    insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4633
4634    if (el->l_tree_depth) {
4635        /*
4636         * If we have tree depth, we read in the
4637         * rightmost extent block ahead of time as
4638         * ocfs2_figure_insert_type() and ocfs2_add_branch()
4639         * may want it later.
4640         */
4641        ret = ocfs2_read_extent_block(et->et_ci,
4642                          ocfs2_et_get_last_eb_blk(et),
4643                          &bh);
4644        if (ret) {
4645            mlog_exit(ret);
4646            goto out;
4647        }
4648        eb = (struct ocfs2_extent_block *) bh->b_data;
4649        el = &eb->h_list;
4650    }
4651
4652    /*
4653     * Unless we have a contiguous insert, we'll need to know if
4654     * there is room left in our allocation tree for another
4655     * extent record.
4656     *
4657     * XXX: This test is simplistic, we can search for empty
4658     * extent records too.
4659     */
4660    *free_records = le16_to_cpu(el->l_count) -
4661        le16_to_cpu(el->l_next_free_rec);
4662
4663    if (!insert->ins_tree_depth) {
4664        ocfs2_figure_contig_type(et, insert, el, insert_rec);
4665        ocfs2_figure_appending_type(insert, el, insert_rec);
4666        return 0;
4667    }
4668
4669    path = ocfs2_new_path_from_et(et);
4670    if (!path) {
4671        ret = -ENOMEM;
4672        mlog_errno(ret);
4673        goto out;
4674    }
4675
4676    /*
4677     * In the case that we're inserting past what the tree
4678     * currently accounts for, ocfs2_find_path() will return for
4679     * us the rightmost tree path. This is accounted for below in
4680     * the appending code.
4681     */
4682    ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4683    if (ret) {
4684        mlog_errno(ret);
4685        goto out;
4686    }
4687
4688    el = path_leaf_el(path);
4689
4690    /*
4691     * Now that we have the path, there's two things we want to determine:
4692     * 1) Contiguousness (also set contig_index if this is so)
4693     *
4694     * 2) Are we doing an append? We can trivially break this up
4695         * into two types of appends: simple record append, or a
4696         * rotate inside the tail leaf.
4697     */
4698    ocfs2_figure_contig_type(et, insert, el, insert_rec);
4699
4700    /*
4701     * The insert code isn't quite ready to deal with all cases of
4702     * left contiguousness. Specifically, if it's an insert into
4703     * the 1st record in a leaf, it will require the adjustment of
4704     * cluster count on the last record of the path directly to it's
4705     * left. For now, just catch that case and fool the layers
4706     * above us. This works just fine for tree_depth == 0, which
4707     * is why we allow that above.
4708     */
4709    if (insert->ins_contig == CONTIG_LEFT &&
4710        insert->ins_contig_index == 0)
4711        insert->ins_contig = CONTIG_NONE;
4712
4713    /*
4714     * Ok, so we can simply compare against last_eb to figure out
4715     * whether the path doesn't exist. This will only happen in
4716     * the case that we're doing a tail append, so maybe we can
4717     * take advantage of that information somehow.
4718     */
4719    if (ocfs2_et_get_last_eb_blk(et) ==
4720        path_leaf_bh(path)->b_blocknr) {
4721        /*
4722         * Ok, ocfs2_find_path() returned us the rightmost
4723         * tree path. This might be an appending insert. There are
4724         * two cases:
4725         * 1) We're doing a true append at the tail:
4726         * -This might even be off the end of the leaf
4727         * 2) We're "appending" by rotating in the tail
4728         */
4729        ocfs2_figure_appending_type(insert, el, insert_rec);
4730    }
4731
4732out:
4733    ocfs2_free_path(path);
4734
4735    if (ret == 0)
4736        *last_eb_bh = bh;
4737    else
4738        brelse(bh);
4739    return ret;
4740}
4741
4742/*
4743 * Insert an extent into a btree.
4744 *
4745 * The caller needs to update the owning btree's cluster count.
4746 */
4747int ocfs2_insert_extent(handle_t *handle,
4748            struct ocfs2_extent_tree *et,
4749            u32 cpos,
4750            u64 start_blk,
4751            u32 new_clusters,
4752            u8 flags,
4753            struct ocfs2_alloc_context *meta_ac)
4754{
4755    int status;
4756    int uninitialized_var(free_records);
4757    struct buffer_head *last_eb_bh = NULL;
4758    struct ocfs2_insert_type insert = {0, };
4759    struct ocfs2_extent_rec rec;
4760
4761    mlog(0, "add %u clusters at position %u to owner %llu\n",
4762         new_clusters, cpos,
4763         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4764
4765    memset(&rec, 0, sizeof(rec));
4766    rec.e_cpos = cpu_to_le32(cpos);
4767    rec.e_blkno = cpu_to_le64(start_blk);
4768    rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4769    rec.e_flags = flags;
4770    status = ocfs2_et_insert_check(et, &rec);
4771    if (status) {
4772        mlog_errno(status);
4773        goto bail;
4774    }
4775
4776    status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4777                      &free_records, &insert);
4778    if (status < 0) {
4779        mlog_errno(status);
4780        goto bail;
4781    }
4782
4783    mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4784         "Insert.contig_index: %d, Insert.free_records: %d, "
4785         "Insert.tree_depth: %d\n",
4786         insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4787         free_records, insert.ins_tree_depth);
4788
4789    if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4790        status = ocfs2_grow_tree(handle, et,
4791                     &insert.ins_tree_depth, &last_eb_bh,
4792                     meta_ac);
4793        if (status) {
4794            mlog_errno(status);
4795            goto bail;
4796        }
4797    }
4798
4799    /* Finally, we can add clusters. This might rotate the tree for us. */
4800    status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4801    if (status < 0)
4802        mlog_errno(status);
4803    else
4804        ocfs2_et_extent_map_insert(et, &rec);
4805
4806bail:
4807    brelse(last_eb_bh);
4808
4809    mlog_exit(status);
4810    return status;
4811}
4812
4813/*
4814 * Allcate and add clusters into the extent b-tree.
4815 * The new clusters(clusters_to_add) will be inserted at logical_offset.
4816 * The extent b-tree's root is specified by et, and
4817 * it is not limited to the file storage. Any extent tree can use this
4818 * function if it implements the proper ocfs2_extent_tree.
4819 */
4820int ocfs2_add_clusters_in_btree(handle_t *handle,
4821                struct ocfs2_extent_tree *et,
4822                u32 *logical_offset,
4823                u32 clusters_to_add,
4824                int mark_unwritten,
4825                struct ocfs2_alloc_context *data_ac,
4826                struct ocfs2_alloc_context *meta_ac,
4827                enum ocfs2_alloc_restarted *reason_ret)
4828{
4829    int status = 0;
4830    int free_extents;
4831    enum ocfs2_alloc_restarted reason = RESTART_NONE;
4832    u32 bit_off, num_bits;
4833    u64 block;
4834    u8 flags = 0;
4835    struct ocfs2_super *osb =
4836        OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4837
4838    BUG_ON(!clusters_to_add);
4839
4840    if (mark_unwritten)
4841        flags = OCFS2_EXT_UNWRITTEN;
4842
4843    free_extents = ocfs2_num_free_extents(osb, et);
4844    if (free_extents < 0) {
4845        status = free_extents;
4846        mlog_errno(status);
4847        goto leave;
4848    }
4849
4850    /* there are two cases which could cause us to EAGAIN in the
4851     * we-need-more-metadata case:
4852     * 1) we haven't reserved *any*
4853     * 2) we are so fragmented, we've needed to add metadata too
4854     * many times. */
4855    if (!free_extents && !meta_ac) {
4856        mlog(0, "we haven't reserved any metadata!\n");
4857        status = -EAGAIN;
4858        reason = RESTART_META;
4859        goto leave;
4860    } else if ((!free_extents)
4861           && (ocfs2_alloc_context_bits_left(meta_ac)
4862               < ocfs2_extend_meta_needed(et->et_root_el))) {
4863        mlog(0, "filesystem is really fragmented...\n");
4864        status = -EAGAIN;
4865        reason = RESTART_META;
4866        goto leave;
4867    }
4868
4869    status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4870                    clusters_to_add, &bit_off, &num_bits);
4871    if (status < 0) {
4872        if (status != -ENOSPC)
4873            mlog_errno(status);
4874        goto leave;
4875    }
4876
4877    BUG_ON(num_bits > clusters_to_add);
4878
4879    /* reserve our write early -- insert_extent may update the tree root */
4880    status = ocfs2_et_root_journal_access(handle, et,
4881                          OCFS2_JOURNAL_ACCESS_WRITE);
4882    if (status < 0) {
4883        mlog_errno(status);
4884        goto leave;
4885    }
4886
4887    block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4888    mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4889         num_bits, bit_off,
4890         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4891    status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4892                     num_bits, flags, meta_ac);
4893    if (status < 0) {
4894        mlog_errno(status);
4895        goto leave;
4896    }
4897
4898    status = ocfs2_journal_dirty(handle, et->et_root_bh);
4899    if (status < 0) {
4900        mlog_errno(status);
4901        goto leave;
4902    }
4903
4904    clusters_to_add -= num_bits;
4905    *logical_offset += num_bits;
4906
4907    if (clusters_to_add) {
4908        mlog(0, "need to alloc once more, wanted = %u\n",
4909             clusters_to_add);
4910        status = -EAGAIN;
4911        reason = RESTART_TRANS;
4912    }
4913
4914leave:
4915    mlog_exit(status);
4916    if (reason_ret)
4917        *reason_ret = reason;
4918    return status;
4919}
4920
4921static void ocfs2_make_right_split_rec(struct super_block *sb,
4922                       struct ocfs2_extent_rec *split_rec,
4923                       u32 cpos,
4924                       struct ocfs2_extent_rec *rec)
4925{
4926    u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4927    u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4928
4929    memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4930
4931    split_rec->e_cpos = cpu_to_le32(cpos);
4932    split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4933
4934    split_rec->e_blkno = rec->e_blkno;
4935    le64_add_cpu(&split_rec->e_blkno,
4936             ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4937
4938    split_rec->e_flags = rec->e_flags;
4939}
4940
4941static int ocfs2_split_and_insert(handle_t *handle,
4942                  struct ocfs2_extent_tree *et,
4943                  struct ocfs2_path *path,
4944                  struct buffer_head **last_eb_bh,
4945                  int split_index,
4946                  struct ocfs2_extent_rec *orig_split_rec,
4947                  struct ocfs2_alloc_context *meta_ac)
4948{
4949    int ret = 0, depth;
4950    unsigned int insert_range, rec_range, do_leftright = 0;
4951    struct ocfs2_extent_rec tmprec;
4952    struct ocfs2_extent_list *rightmost_el;
4953    struct ocfs2_extent_rec rec;
4954    struct ocfs2_extent_rec split_rec = *orig_split_rec;
4955    struct ocfs2_insert_type insert;
4956    struct ocfs2_extent_block *eb;
4957
4958leftright:
4959    /*
4960     * Store a copy of the record on the stack - it might move
4961     * around as the tree is manipulated below.
4962     */
4963    rec = path_leaf_el(path)->l_recs[split_index];
4964
4965    rightmost_el = et->et_root_el;
4966
4967    depth = le16_to_cpu(rightmost_el->l_tree_depth);
4968    if (depth) {
4969        BUG_ON(!(*last_eb_bh));
4970        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4971        rightmost_el = &eb->h_list;
4972    }
4973
4974    if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4975        le16_to_cpu(rightmost_el->l_count)) {
4976        ret = ocfs2_grow_tree(handle, et,
4977                      &depth, last_eb_bh, meta_ac);
4978        if (ret) {
4979            mlog_errno(ret);
4980            goto out;
4981        }
4982    }
4983
4984    memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4985    insert.ins_appending = APPEND_NONE;
4986    insert.ins_contig = CONTIG_NONE;
4987    insert.ins_tree_depth = depth;
4988
4989    insert_range = le32_to_cpu(split_rec.e_cpos) +
4990        le16_to_cpu(split_rec.e_leaf_clusters);
4991    rec_range = le32_to_cpu(rec.e_cpos) +
4992        le16_to_cpu(rec.e_leaf_clusters);
4993
4994    if (split_rec.e_cpos == rec.e_cpos) {
4995        insert.ins_split = SPLIT_LEFT;
4996    } else if (insert_range == rec_range) {
4997        insert.ins_split = SPLIT_RIGHT;
4998    } else {
4999        /*
5000         * Left/right split. We fake this as a right split
5001         * first and then make a second pass as a left split.
5002         */
5003        insert.ins_split = SPLIT_RIGHT;
5004
5005        ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5006                       &tmprec, insert_range, &rec);
5007
5008        split_rec = tmprec;
5009
5010        BUG_ON(do_leftright);
5011        do_leftright = 1;
5012    }
5013
5014    ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5015    if (ret) {
5016        mlog_errno(ret);
5017        goto out;
5018    }
5019
5020    if (do_leftright == 1) {
5021        u32 cpos;
5022        struct ocfs2_extent_list *el;
5023
5024        do_leftright++;
5025        split_rec = *orig_split_rec;
5026
5027        ocfs2_reinit_path(path, 1);
5028
5029        cpos = le32_to_cpu(split_rec.e_cpos);
5030        ret = ocfs2_find_path(et->et_ci, path, cpos);
5031        if (ret) {
5032            mlog_errno(ret);
5033            goto out;
5034        }
5035
5036        el = path_leaf_el(path);
5037        split_index = ocfs2_search_extent_list(el, cpos);
5038        goto leftright;
5039    }
5040out:
5041
5042    return ret;
5043}
5044
5045static int ocfs2_replace_extent_rec(handle_t *handle,
5046                    struct ocfs2_extent_tree *et,
5047                    struct ocfs2_path *path,
5048                    struct ocfs2_extent_list *el,
5049                    int split_index,
5050                    struct ocfs2_extent_rec *split_rec)
5051{
5052    int ret;
5053
5054    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
5055                       path_num_items(path) - 1);
5056    if (ret) {
5057        mlog_errno(ret);
5058        goto out;
5059    }
5060
5061    el->l_recs[split_index] = *split_rec;
5062
5063    ocfs2_journal_dirty(handle, path_leaf_bh(path));
5064out:
5065    return ret;
5066}
5067
5068/*
5069 * Split part or all of the extent record at split_index in the leaf
5070 * pointed to by path. Merge with the contiguous extent record if needed.
5071 *
5072 * Care is taken to handle contiguousness so as to not grow the tree.
5073 *
5074 * meta_ac is not strictly necessary - we only truly need it if growth
5075 * of the tree is required. All other cases will degrade into a less
5076 * optimal tree layout.
5077 *
5078 * last_eb_bh should be the rightmost leaf block for any extent
5079 * btree. Since a split may grow the tree or a merge might shrink it,
5080 * the caller cannot trust the contents of that buffer after this call.
5081 *
5082 * This code is optimized for readability - several passes might be
5083 * made over certain portions of the tree. All of those blocks will
5084 * have been brought into cache (and pinned via the journal), so the
5085 * extra overhead is not expressed in terms of disk reads.
5086 */
5087int ocfs2_split_extent(handle_t *handle,
5088               struct ocfs2_extent_tree *et,
5089               struct ocfs2_path *path,
5090               int split_index,
5091               struct ocfs2_extent_rec *split_rec,
5092               struct ocfs2_alloc_context *meta_ac,
5093               struct ocfs2_cached_dealloc_ctxt *dealloc)
5094{
5095    int ret = 0;
5096    struct ocfs2_extent_list *el = path_leaf_el(path);
5097    struct buffer_head *last_eb_bh = NULL;
5098    struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5099    struct ocfs2_merge_ctxt ctxt;
5100    struct ocfs2_extent_list *rightmost_el;
5101
5102    if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5103        ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5104         (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5105        ret = -EIO;
5106        mlog_errno(ret);
5107        goto out;
5108    }
5109
5110    ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5111                                split_index,
5112                                split_rec);
5113
5114    /*
5115     * The core merge / split code wants to know how much room is
5116     * left in this allocation tree, so we pass the
5117     * rightmost extent list.
5118     */
5119    if (path->p_tree_depth) {
5120        struct ocfs2_extent_block *eb;
5121
5122        ret = ocfs2_read_extent_block(et->et_ci,
5123                          ocfs2_et_get_last_eb_blk(et),
5124                          &last_eb_bh);
5125        if (ret) {
5126            mlog_exit(ret);
5127            goto out;
5128        }
5129
5130        eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5131        rightmost_el = &eb->h_list;
5132    } else
5133        rightmost_el = path_root_el(path);
5134
5135    if (rec->e_cpos == split_rec->e_cpos &&
5136        rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5137        ctxt.c_split_covers_rec = 1;
5138    else
5139        ctxt.c_split_covers_rec = 0;
5140
5141    ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5142
5143    mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
5144         split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
5145         ctxt.c_split_covers_rec);
5146
5147    if (ctxt.c_contig_type == CONTIG_NONE) {
5148        if (ctxt.c_split_covers_rec)
5149            ret = ocfs2_replace_extent_rec(handle, et, path, el,
5150                               split_index, split_rec);
5151        else
5152            ret = ocfs2_split_and_insert(handle, et, path,
5153                             &last_eb_bh, split_index,
5154                             split_rec, meta_ac);
5155        if (ret)
5156            mlog_errno(ret);
5157    } else {
5158        ret = ocfs2_try_to_merge_extent(handle, et, path,
5159                        split_index, split_rec,
5160                        dealloc, &ctxt);
5161        if (ret)
5162            mlog_errno(ret);
5163    }
5164
5165out:
5166    brelse(last_eb_bh);
5167    return ret;
5168}
5169
5170/*
5171 * Change the flags of the already-existing extent at cpos for len clusters.
5172 *
5173 * new_flags: the flags we want to set.
5174 * clear_flags: the flags we want to clear.
5175 * phys: the new physical offset we want this new extent starts from.
5176 *
5177 * If the existing extent is larger than the request, initiate a
5178 * split. An attempt will be made at merging with adjacent extents.
5179 *
5180 * The caller is responsible for passing down meta_ac if we'll need it.
5181 */
5182int ocfs2_change_extent_flag(handle_t *handle,
5183                 struct ocfs2_extent_tree *et,
5184                 u32 cpos, u32 len, u32 phys,
5185                 struct ocfs2_alloc_context *meta_ac,
5186                 struct ocfs2_cached_dealloc_ctxt *dealloc,
5187                 int new_flags, int clear_flags)
5188{
5189    int ret, index;
5190    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5191    u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5192    struct ocfs2_extent_rec split_rec;
5193    struct ocfs2_path *left_path = NULL;
5194    struct ocfs2_extent_list *el;
5195    struct ocfs2_extent_rec *rec;
5196
5197    left_path = ocfs2_new_path_from_et(et);
5198    if (!left_path) {
5199        ret = -ENOMEM;
5200        mlog_errno(ret);
5201        goto out;
5202    }
5203
5204    ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5205    if (ret) {
5206        mlog_errno(ret);
5207        goto out;
5208    }
5209    el = path_leaf_el(left_path);
5210
5211    index = ocfs2_search_extent_list(el, cpos);
5212    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5213        ocfs2_error(sb,
5214                "Owner %llu has an extent at cpos %u which can no "
5215                "longer be found.\n",
5216                 (unsigned long long)
5217                 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5218        ret = -EROFS;
5219        goto out;
5220    }
5221
5222    ret = -EIO;
5223    rec = &el->l_recs[index];
5224    if (new_flags && (rec->e_flags & new_flags)) {
5225        mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5226             "extent that already had them",
5227             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5228             new_flags);
5229        goto out;
5230    }
5231
5232    if (clear_flags && !(rec->e_flags & clear_flags)) {
5233        mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5234             "extent that didn't have them",
5235             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5236             clear_flags);
5237        goto out;
5238    }
5239
5240    memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5241    split_rec.e_cpos = cpu_to_le32(cpos);
5242    split_rec.e_leaf_clusters = cpu_to_le16(len);
5243    split_rec.e_blkno = cpu_to_le64(start_blkno);
5244    split_rec.e_flags = rec->e_flags;
5245    if (new_flags)
5246        split_rec.e_flags |= new_flags;
5247    if (clear_flags)
5248        split_rec.e_flags &= ~clear_flags;
5249
5250    ret = ocfs2_split_extent(handle, et, left_path,
5251                 index, &split_rec, meta_ac,
5252                 dealloc);
5253    if (ret)
5254        mlog_errno(ret);
5255
5256out:
5257    ocfs2_free_path(left_path);
5258    return ret;
5259
5260}
5261
5262/*
5263 * Mark the already-existing extent at cpos as written for len clusters.
5264 * This removes the unwritten extent flag.
5265 *
5266 * If the existing extent is larger than the request, initiate a
5267 * split. An attempt will be made at merging with adjacent extents.
5268 *
5269 * The caller is responsible for passing down meta_ac if we'll need it.
5270 */
5271int ocfs2_mark_extent_written(struct inode *inode,
5272                  struct ocfs2_extent_tree *et,
5273                  handle_t *handle, u32 cpos, u32 len, u32 phys,
5274                  struct ocfs2_alloc_context *meta_ac,
5275                  struct ocfs2_cached_dealloc_ctxt *dealloc)
5276{
5277    int ret;
5278
5279    mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5280         inode->i_ino, cpos, len, phys);
5281
5282    if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5283        ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5284                "that are being written to, but the feature bit "
5285                "is not set in the super block.",
5286                (unsigned long long)OCFS2_I(inode)->ip_blkno);
5287        ret = -EROFS;
5288        goto out;
5289    }
5290
5291    /*
5292     * XXX: This should be fixed up so that we just re-insert the
5293     * next extent records.
5294     */
5295    ocfs2_et_extent_map_truncate(et, 0);
5296
5297    ret = ocfs2_change_extent_flag(handle, et, cpos,
5298                       len, phys, meta_ac, dealloc,
5299                       0, OCFS2_EXT_UNWRITTEN);
5300    if (ret)
5301        mlog_errno(ret);
5302
5303out:
5304    return ret;
5305}
5306
5307static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5308                struct ocfs2_path *path,
5309                int index, u32 new_range,
5310                struct ocfs2_alloc_context *meta_ac)
5311{
5312    int ret, depth, credits = handle->h_buffer_credits;
5313    struct buffer_head *last_eb_bh = NULL;
5314    struct ocfs2_extent_block *eb;
5315    struct ocfs2_extent_list *rightmost_el, *el;
5316    struct ocfs2_extent_rec split_rec;
5317    struct ocfs2_extent_rec *rec;
5318    struct ocfs2_insert_type insert;
5319
5320    /*
5321     * Setup the record to split before we grow the tree.
5322     */
5323    el = path_leaf_el(path);
5324    rec = &el->l_recs[index];
5325    ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5326                   &split_rec, new_range, rec);
5327
5328    depth = path->p_tree_depth;
5329    if (depth > 0) {
5330        ret = ocfs2_read_extent_block(et->et_ci,
5331                          ocfs2_et_get_last_eb_blk(et),
5332                          &last_eb_bh);
5333        if (ret < 0) {
5334            mlog_errno(ret);
5335            goto out;
5336        }
5337
5338        eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5339        rightmost_el = &eb->h_list;
5340    } else
5341        rightmost_el = path_leaf_el(path);
5342
5343    credits += path->p_tree_depth +
5344           ocfs2_extend_meta_needed(et->et_root_el);
5345    ret = ocfs2_extend_trans(handle, credits);
5346    if (ret) {
5347        mlog_errno(ret);
5348        goto out;
5349    }
5350
5351    if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5352        le16_to_cpu(rightmost_el->l_count)) {
5353        ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5354                      meta_ac);
5355        if (ret) {
5356            mlog_errno(ret);
5357            goto out;
5358        }
5359    }
5360
5361    memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5362    insert.ins_appending = APPEND_NONE;
5363    insert.ins_contig = CONTIG_NONE;
5364    insert.ins_split = SPLIT_RIGHT;
5365    insert.ins_tree_depth = depth;
5366
5367    ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5368    if (ret)
5369        mlog_errno(ret);
5370
5371out:
5372    brelse(last_eb_bh);
5373    return ret;
5374}
5375
5376static int ocfs2_truncate_rec(handle_t *handle,
5377                  struct ocfs2_extent_tree *et,
5378                  struct ocfs2_path *path, int index,
5379                  struct ocfs2_cached_dealloc_ctxt *dealloc,
5380                  u32 cpos, u32 len)
5381{
5382    int ret;
5383    u32 left_cpos, rec_range, trunc_range;
5384    int wants_rotate = 0, is_rightmost_tree_rec = 0;
5385    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5386    struct ocfs2_path *left_path = NULL;
5387    struct ocfs2_extent_list *el = path_leaf_el(path);
5388    struct ocfs2_extent_rec *rec;
5389    struct ocfs2_extent_block *eb;
5390
5391    if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5392        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5393        if (ret) {
5394            mlog_errno(ret);
5395            goto out;
5396        }
5397
5398        index--;
5399    }
5400
5401    if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5402        path->p_tree_depth) {
5403        /*
5404         * Check whether this is the rightmost tree record. If
5405         * we remove all of this record or part of its right
5406         * edge then an update of the record lengths above it
5407         * will be required.
5408         */
5409        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5410        if (eb->h_next_leaf_blk == 0)
5411            is_rightmost_tree_rec = 1;
5412    }
5413
5414    rec = &el->l_recs[index];
5415    if (index == 0 && path->p_tree_depth &&
5416        le32_to_cpu(rec->e_cpos) == cpos) {
5417        /*
5418         * Changing the leftmost offset (via partial or whole
5419         * record truncate) of an interior (or rightmost) path
5420         * means we have to update the subtree that is formed
5421         * by this leaf and the one to it's left.
5422         *
5423         * There are two cases we can skip:
5424         * 1) Path is the leftmost one in our btree.
5425         * 2) The leaf is rightmost and will be empty after
5426         * we remove the extent record - the rotate code
5427         * knows how to update the newly formed edge.
5428         */
5429
5430        ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5431        if (ret) {
5432            mlog_errno(ret);
5433            goto out;
5434        }
5435
5436        if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5437            left_path = ocfs2_new_path_from_path(path);
5438            if (!left_path) {
5439                ret = -ENOMEM;
5440                mlog_errno(ret);
5441                goto out;
5442            }
5443
5444            ret = ocfs2_find_path(et->et_ci, left_path,
5445                          left_cpos);
5446            if (ret) {
5447                mlog_errno(ret);
5448                goto out;
5449            }
5450        }
5451    }
5452
5453    ret = ocfs2_extend_rotate_transaction(handle, 0,
5454                          handle->h_buffer_credits,
5455                          path);
5456    if (ret) {
5457        mlog_errno(ret);
5458        goto out;
5459    }
5460
5461    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5462    if (ret) {
5463        mlog_errno(ret);
5464        goto out;
5465    }
5466
5467    ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5468    if (ret) {
5469        mlog_errno(ret);
5470        goto out;
5471    }
5472
5473    rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5474    trunc_range = cpos + len;
5475
5476    if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5477        int next_free;
5478
5479        memset(rec, 0, sizeof(*rec));
5480        ocfs2_cleanup_merge(el, index);
5481        wants_rotate = 1;
5482
5483        next_free = le16_to_cpu(el->l_next_free_rec);
5484        if (is_rightmost_tree_rec && next_free > 1) {
5485            /*
5486             * We skip the edge update if this path will
5487             * be deleted by the rotate code.
5488             */
5489            rec = &el->l_recs[next_free - 1];
5490            ocfs2_adjust_rightmost_records(handle, et, path,
5491                               rec);
5492        }
5493    } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5494        /* Remove leftmost portion of the record. */
5495        le32_add_cpu(&rec->e_cpos, len);
5496        le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5497        le16_add_cpu(&rec->e_leaf_clusters, -len);
5498    } else if (rec_range == trunc_range) {
5499        /* Remove rightmost portion of the record */
5500        le16_add_cpu(&rec->e_leaf_clusters, -len);
5501        if (is_rightmost_tree_rec)
5502            ocfs2_adjust_rightmost_records(handle, et, path, rec);
5503    } else {
5504        /* Caller should have trapped this. */
5505        mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5506             "(%u, %u)\n",
5507             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5508             le32_to_cpu(rec->e_cpos),
5509             le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5510        BUG();
5511    }
5512
5513    if (left_path) {
5514        int subtree_index;
5515
5516        subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5517        ocfs2_complete_edge_insert(handle, left_path, path,
5518                       subtree_index);
5519    }
5520
5521    ocfs2_journal_dirty(handle, path_leaf_bh(path));
5522
5523    ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5524    if (ret) {
5525        mlog_errno(ret);
5526        goto out;
5527    }
5528
5529out:
5530    ocfs2_free_path(left_path);
5531    return ret;
5532}
5533
5534int ocfs2_remove_extent(handle_t *handle,
5535            struct ocfs2_extent_tree *et,
5536            u32 cpos, u32 len,
5537            struct ocfs2_alloc_context *meta_ac,
5538            struct ocfs2_cached_dealloc_ctxt *dealloc)
5539{
5540    int ret, index;
5541    u32 rec_range, trunc_range;
5542    struct ocfs2_extent_rec *rec;
5543    struct ocfs2_extent_list *el;
5544    struct ocfs2_path *path = NULL;
5545
5546    /*
5547     * XXX: Why are we truncating to 0 instead of wherever this
5548     * affects us?
5549     */
5550    ocfs2_et_extent_map_truncate(et, 0);
5551
5552    path = ocfs2_new_path_from_et(et);
5553    if (!path) {
5554        ret = -ENOMEM;
5555        mlog_errno(ret);
5556        goto out;
5557    }
5558
5559    ret = ocfs2_find_path(et->et_ci, path, cpos);
5560    if (ret) {
5561        mlog_errno(ret);
5562        goto out;
5563    }
5564
5565    el = path_leaf_el(path);
5566    index = ocfs2_search_extent_list(el, cpos);
5567    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5568        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5569                "Owner %llu has an extent at cpos %u which can no "
5570                "longer be found.\n",
5571                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5572                cpos);
5573        ret = -EROFS;
5574        goto out;
5575    }
5576
5577    /*
5578     * We have 3 cases of extent removal:
5579     * 1) Range covers the entire extent rec
5580     * 2) Range begins or ends on one edge of the extent rec
5581     * 3) Range is in the middle of the extent rec (no shared edges)
5582     *
5583     * For case 1 we remove the extent rec and left rotate to
5584     * fill the hole.
5585     *
5586     * For case 2 we just shrink the existing extent rec, with a
5587     * tree update if the shrinking edge is also the edge of an
5588     * extent block.
5589     *
5590     * For case 3 we do a right split to turn the extent rec into
5591     * something case 2 can handle.
5592     */
5593    rec = &el->l_recs[index];
5594    rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5595    trunc_range = cpos + len;
5596
5597    BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5598
5599    mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5600         "(cpos %u, len %u)\n",
5601         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5602         cpos, len, index,
5603         le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5604
5605    if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5606        ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5607                     cpos, len);