Root/fs/ocfs2/alloc.c

1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * alloc.c
5 *
6 * Extent allocs and frees
7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/fs.h>
27#include <linux/types.h>
28#include <linux/slab.h>
29#include <linux/highmem.h>
30#include <linux/swap.h>
31#include <linux/quotaops.h>
32#include <linux/blkdev.h>
33
34#include <cluster/masklog.h>
35
36#include "ocfs2.h"
37
38#include "alloc.h"
39#include "aops.h"
40#include "blockcheck.h"
41#include "dlmglue.h"
42#include "extent_map.h"
43#include "inode.h"
44#include "journal.h"
45#include "localalloc.h"
46#include "suballoc.h"
47#include "sysfile.h"
48#include "file.h"
49#include "super.h"
50#include "uptodate.h"
51#include "xattr.h"
52#include "refcounttree.h"
53#include "ocfs2_trace.h"
54
55#include "buffer_head_io.h"
56
57enum ocfs2_contig_type {
58    CONTIG_NONE = 0,
59    CONTIG_LEFT,
60    CONTIG_RIGHT,
61    CONTIG_LEFTRIGHT,
62};
63
64static enum ocfs2_contig_type
65    ocfs2_extent_rec_contig(struct super_block *sb,
66                struct ocfs2_extent_rec *ext,
67                struct ocfs2_extent_rec *insert_rec);
68/*
69 * Operations for a specific extent tree type.
70 *
71 * To implement an on-disk btree (extent tree) type in ocfs2, add
72 * an ocfs2_extent_tree_operations structure and the matching
73 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
74 * for the allocation portion of the extent tree.
75 */
76struct ocfs2_extent_tree_operations {
77    /*
78     * last_eb_blk is the block number of the right most leaf extent
79     * block. Most on-disk structures containing an extent tree store
80     * this value for fast access. The ->eo_set_last_eb_blk() and
81     * ->eo_get_last_eb_blk() operations access this value. They are
82     * both required.
83     */
84    void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
85                   u64 blkno);
86    u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
87
88    /*
89     * The on-disk structure usually keeps track of how many total
90     * clusters are stored in this extent tree. This function updates
91     * that value. new_clusters is the delta, and must be
92     * added to the total. Required.
93     */
94    void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
95                   u32 new_clusters);
96
97    /*
98     * If this extent tree is supported by an extent map, insert
99     * a record into the map.
100     */
101    void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
102                     struct ocfs2_extent_rec *rec);
103
104    /*
105     * If this extent tree is supported by an extent map, truncate the
106     * map to clusters,
107     */
108    void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
109                       u32 clusters);
110
111    /*
112     * If ->eo_insert_check() exists, it is called before rec is
113     * inserted into the extent tree. It is optional.
114     */
115    int (*eo_insert_check)(struct ocfs2_extent_tree *et,
116                   struct ocfs2_extent_rec *rec);
117    int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
118
119    /*
120     * --------------------------------------------------------------
121     * The remaining are internal to ocfs2_extent_tree and don't have
122     * accessor functions
123     */
124
125    /*
126     * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
127     * It is required.
128     */
129    void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
130
131    /*
132     * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
133     * it exists. If it does not, et->et_max_leaf_clusters is set
134     * to 0 (unlimited). Optional.
135     */
136    void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
137
138    /*
139     * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
140     * are contiguous or not. Optional. Don't need to set it if use
141     * ocfs2_extent_rec as the tree leaf.
142     */
143    enum ocfs2_contig_type
144        (*eo_extent_contig)(struct ocfs2_extent_tree *et,
145                    struct ocfs2_extent_rec *ext,
146                    struct ocfs2_extent_rec *insert_rec);
147};
148
149
150/*
151 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
152 * in the methods.
153 */
154static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
155static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
156                     u64 blkno);
157static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
158                     u32 clusters);
159static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
160                       struct ocfs2_extent_rec *rec);
161static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
162                         u32 clusters);
163static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
164                     struct ocfs2_extent_rec *rec);
165static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
166static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
167static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
168    .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
169    .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
170    .eo_update_clusters = ocfs2_dinode_update_clusters,
171    .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
172    .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
173    .eo_insert_check = ocfs2_dinode_insert_check,
174    .eo_sanity_check = ocfs2_dinode_sanity_check,
175    .eo_fill_root_el = ocfs2_dinode_fill_root_el,
176};
177
178static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
179                     u64 blkno)
180{
181    struct ocfs2_dinode *di = et->et_object;
182
183    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
184    di->i_last_eb_blk = cpu_to_le64(blkno);
185}
186
187static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
188{
189    struct ocfs2_dinode *di = et->et_object;
190
191    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
192    return le64_to_cpu(di->i_last_eb_blk);
193}
194
195static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
196                     u32 clusters)
197{
198    struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
199    struct ocfs2_dinode *di = et->et_object;
200
201    le32_add_cpu(&di->i_clusters, clusters);
202    spin_lock(&oi->ip_lock);
203    oi->ip_clusters = le32_to_cpu(di->i_clusters);
204    spin_unlock(&oi->ip_lock);
205}
206
207static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
208                       struct ocfs2_extent_rec *rec)
209{
210    struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
211
212    ocfs2_extent_map_insert_rec(inode, rec);
213}
214
215static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
216                         u32 clusters)
217{
218    struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
219
220    ocfs2_extent_map_trunc(inode, clusters);
221}
222
223static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
224                     struct ocfs2_extent_rec *rec)
225{
226    struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
227    struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
228
229    BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
230    mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
231            (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
232            "Device %s, asking for sparse allocation: inode %llu, "
233            "cpos %u, clusters %u\n",
234            osb->dev_str,
235            (unsigned long long)oi->ip_blkno,
236            rec->e_cpos, oi->ip_clusters);
237
238    return 0;
239}
240
241static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
242{
243    struct ocfs2_dinode *di = et->et_object;
244
245    BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
246    BUG_ON(!OCFS2_IS_VALID_DINODE(di));
247
248    return 0;
249}
250
251static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
252{
253    struct ocfs2_dinode *di = et->et_object;
254
255    et->et_root_el = &di->id2.i_list;
256}
257
258
259static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
260{
261    struct ocfs2_xattr_value_buf *vb = et->et_object;
262
263    et->et_root_el = &vb->vb_xv->xr_list;
264}
265
266static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
267                          u64 blkno)
268{
269    struct ocfs2_xattr_value_buf *vb = et->et_object;
270
271    vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
272}
273
274static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
275{
276    struct ocfs2_xattr_value_buf *vb = et->et_object;
277
278    return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
279}
280
281static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
282                          u32 clusters)
283{
284    struct ocfs2_xattr_value_buf *vb = et->et_object;
285
286    le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
287}
288
289static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
290    .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
291    .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
292    .eo_update_clusters = ocfs2_xattr_value_update_clusters,
293    .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
294};
295
296static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
297{
298    struct ocfs2_xattr_block *xb = et->et_object;
299
300    et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
301}
302
303static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
304{
305    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
306    et->et_max_leaf_clusters =
307        ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
308}
309
310static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
311                         u64 blkno)
312{
313    struct ocfs2_xattr_block *xb = et->et_object;
314    struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
315
316    xt->xt_last_eb_blk = cpu_to_le64(blkno);
317}
318
319static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
320{
321    struct ocfs2_xattr_block *xb = et->et_object;
322    struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
323
324    return le64_to_cpu(xt->xt_last_eb_blk);
325}
326
327static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
328                         u32 clusters)
329{
330    struct ocfs2_xattr_block *xb = et->et_object;
331
332    le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
333}
334
335static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
336    .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
337    .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
338    .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
339    .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
340    .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
341};
342
343static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
344                      u64 blkno)
345{
346    struct ocfs2_dx_root_block *dx_root = et->et_object;
347
348    dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
349}
350
351static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
352{
353    struct ocfs2_dx_root_block *dx_root = et->et_object;
354
355    return le64_to_cpu(dx_root->dr_last_eb_blk);
356}
357
358static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
359                      u32 clusters)
360{
361    struct ocfs2_dx_root_block *dx_root = et->et_object;
362
363    le32_add_cpu(&dx_root->dr_clusters, clusters);
364}
365
366static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
367{
368    struct ocfs2_dx_root_block *dx_root = et->et_object;
369
370    BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
371
372    return 0;
373}
374
375static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
376{
377    struct ocfs2_dx_root_block *dx_root = et->et_object;
378
379    et->et_root_el = &dx_root->dr_list;
380}
381
382static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
383    .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk,
384    .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk,
385    .eo_update_clusters = ocfs2_dx_root_update_clusters,
386    .eo_sanity_check = ocfs2_dx_root_sanity_check,
387    .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
388};
389
390static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
391{
392    struct ocfs2_refcount_block *rb = et->et_object;
393
394    et->et_root_el = &rb->rf_list;
395}
396
397static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
398                        u64 blkno)
399{
400    struct ocfs2_refcount_block *rb = et->et_object;
401
402    rb->rf_last_eb_blk = cpu_to_le64(blkno);
403}
404
405static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
406{
407    struct ocfs2_refcount_block *rb = et->et_object;
408
409    return le64_to_cpu(rb->rf_last_eb_blk);
410}
411
412static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
413                        u32 clusters)
414{
415    struct ocfs2_refcount_block *rb = et->et_object;
416
417    le32_add_cpu(&rb->rf_clusters, clusters);
418}
419
420static enum ocfs2_contig_type
421ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
422                  struct ocfs2_extent_rec *ext,
423                  struct ocfs2_extent_rec *insert_rec)
424{
425    return CONTIG_NONE;
426}
427
428static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
429    .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
430    .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
431    .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
432    .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
433    .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
434};
435
436static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
437                     struct ocfs2_caching_info *ci,
438                     struct buffer_head *bh,
439                     ocfs2_journal_access_func access,
440                     void *obj,
441                     struct ocfs2_extent_tree_operations *ops)
442{
443    et->et_ops = ops;
444    et->et_root_bh = bh;
445    et->et_ci = ci;
446    et->et_root_journal_access = access;
447    if (!obj)
448        obj = (void *)bh->b_data;
449    et->et_object = obj;
450
451    et->et_ops->eo_fill_root_el(et);
452    if (!et->et_ops->eo_fill_max_leaf_clusters)
453        et->et_max_leaf_clusters = 0;
454    else
455        et->et_ops->eo_fill_max_leaf_clusters(et);
456}
457
458void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
459                   struct ocfs2_caching_info *ci,
460                   struct buffer_head *bh)
461{
462    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
463                 NULL, &ocfs2_dinode_et_ops);
464}
465
466void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
467                       struct ocfs2_caching_info *ci,
468                       struct buffer_head *bh)
469{
470    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
471                 NULL, &ocfs2_xattr_tree_et_ops);
472}
473
474void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
475                    struct ocfs2_caching_info *ci,
476                    struct ocfs2_xattr_value_buf *vb)
477{
478    __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
479                 &ocfs2_xattr_value_et_ops);
480}
481
482void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
483                    struct ocfs2_caching_info *ci,
484                    struct buffer_head *bh)
485{
486    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
487                 NULL, &ocfs2_dx_root_et_ops);
488}
489
490void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
491                     struct ocfs2_caching_info *ci,
492                     struct buffer_head *bh)
493{
494    __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
495                 NULL, &ocfs2_refcount_tree_et_ops);
496}
497
498static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
499                        u64 new_last_eb_blk)
500{
501    et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
502}
503
504static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
505{
506    return et->et_ops->eo_get_last_eb_blk(et);
507}
508
509static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
510                        u32 clusters)
511{
512    et->et_ops->eo_update_clusters(et, clusters);
513}
514
515static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
516                          struct ocfs2_extent_rec *rec)
517{
518    if (et->et_ops->eo_extent_map_insert)
519        et->et_ops->eo_extent_map_insert(et, rec);
520}
521
522static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
523                        u32 clusters)
524{
525    if (et->et_ops->eo_extent_map_truncate)
526        et->et_ops->eo_extent_map_truncate(et, clusters);
527}
528
529static inline int ocfs2_et_root_journal_access(handle_t *handle,
530                           struct ocfs2_extent_tree *et,
531                           int type)
532{
533    return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
534                      type);
535}
536
537static inline enum ocfs2_contig_type
538    ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
539                   struct ocfs2_extent_rec *rec,
540                   struct ocfs2_extent_rec *insert_rec)
541{
542    if (et->et_ops->eo_extent_contig)
543        return et->et_ops->eo_extent_contig(et, rec, insert_rec);
544
545    return ocfs2_extent_rec_contig(
546                ocfs2_metadata_cache_get_super(et->et_ci),
547                rec, insert_rec);
548}
549
550static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
551                    struct ocfs2_extent_rec *rec)
552{
553    int ret = 0;
554
555    if (et->et_ops->eo_insert_check)
556        ret = et->et_ops->eo_insert_check(et, rec);
557    return ret;
558}
559
560static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
561{
562    int ret = 0;
563
564    if (et->et_ops->eo_sanity_check)
565        ret = et->et_ops->eo_sanity_check(et);
566    return ret;
567}
568
569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570                     struct ocfs2_extent_block *eb);
571static void ocfs2_adjust_rightmost_records(handle_t *handle,
572                       struct ocfs2_extent_tree *et,
573                       struct ocfs2_path *path,
574                       struct ocfs2_extent_rec *insert_rec);
575/*
576 * Reset the actual path elements so that we can re-use the structure
577 * to build another path. Generally, this involves freeing the buffer
578 * heads.
579 */
580void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
581{
582    int i, start = 0, depth = 0;
583    struct ocfs2_path_item *node;
584
585    if (keep_root)
586        start = 1;
587
588    for(i = start; i < path_num_items(path); i++) {
589        node = &path->p_node[i];
590
591        brelse(node->bh);
592        node->bh = NULL;
593        node->el = NULL;
594    }
595
596    /*
597     * Tree depth may change during truncate, or insert. If we're
598     * keeping the root extent list, then make sure that our path
599     * structure reflects the proper depth.
600     */
601    if (keep_root)
602        depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
603    else
604        path_root_access(path) = NULL;
605
606    path->p_tree_depth = depth;
607}
608
609void ocfs2_free_path(struct ocfs2_path *path)
610{
611    if (path) {
612        ocfs2_reinit_path(path, 0);
613        kfree(path);
614    }
615}
616
617/*
618 * All the elements of src into dest. After this call, src could be freed
619 * without affecting dest.
620 *
621 * Both paths should have the same root. Any non-root elements of dest
622 * will be freed.
623 */
624static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
625{
626    int i;
627
628    BUG_ON(path_root_bh(dest) != path_root_bh(src));
629    BUG_ON(path_root_el(dest) != path_root_el(src));
630    BUG_ON(path_root_access(dest) != path_root_access(src));
631
632    ocfs2_reinit_path(dest, 1);
633
634    for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
635        dest->p_node[i].bh = src->p_node[i].bh;
636        dest->p_node[i].el = src->p_node[i].el;
637
638        if (dest->p_node[i].bh)
639            get_bh(dest->p_node[i].bh);
640    }
641}
642
643/*
644 * Make the *dest path the same as src and re-initialize src path to
645 * have a root only.
646 */
647static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
648{
649    int i;
650
651    BUG_ON(path_root_bh(dest) != path_root_bh(src));
652    BUG_ON(path_root_access(dest) != path_root_access(src));
653
654    for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
655        brelse(dest->p_node[i].bh);
656
657        dest->p_node[i].bh = src->p_node[i].bh;
658        dest->p_node[i].el = src->p_node[i].el;
659
660        src->p_node[i].bh = NULL;
661        src->p_node[i].el = NULL;
662    }
663}
664
665/*
666 * Insert an extent block at given index.
667 *
668 * This will not take an additional reference on eb_bh.
669 */
670static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
671                    struct buffer_head *eb_bh)
672{
673    struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
674
675    /*
676     * Right now, no root bh is an extent block, so this helps
677     * catch code errors with dinode trees. The assertion can be
678     * safely removed if we ever need to insert extent block
679     * structures at the root.
680     */
681    BUG_ON(index == 0);
682
683    path->p_node[index].bh = eb_bh;
684    path->p_node[index].el = &eb->h_list;
685}
686
687static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
688                     struct ocfs2_extent_list *root_el,
689                     ocfs2_journal_access_func access)
690{
691    struct ocfs2_path *path;
692
693    BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
694
695    path = kzalloc(sizeof(*path), GFP_NOFS);
696    if (path) {
697        path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
698        get_bh(root_bh);
699        path_root_bh(path) = root_bh;
700        path_root_el(path) = root_el;
701        path_root_access(path) = access;
702    }
703
704    return path;
705}
706
707struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
708{
709    return ocfs2_new_path(path_root_bh(path), path_root_el(path),
710                  path_root_access(path));
711}
712
713struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
714{
715    return ocfs2_new_path(et->et_root_bh, et->et_root_el,
716                  et->et_root_journal_access);
717}
718
719/*
720 * Journal the buffer at depth idx. All idx>0 are extent_blocks,
721 * otherwise it's the root_access function.
722 *
723 * I don't like the way this function's name looks next to
724 * ocfs2_journal_access_path(), but I don't have a better one.
725 */
726int ocfs2_path_bh_journal_access(handle_t *handle,
727                 struct ocfs2_caching_info *ci,
728                 struct ocfs2_path *path,
729                 int idx)
730{
731    ocfs2_journal_access_func access = path_root_access(path);
732
733    if (!access)
734        access = ocfs2_journal_access;
735
736    if (idx)
737        access = ocfs2_journal_access_eb;
738
739    return access(handle, ci, path->p_node[idx].bh,
740              OCFS2_JOURNAL_ACCESS_WRITE);
741}
742
743/*
744 * Convenience function to journal all components in a path.
745 */
746int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
747                  handle_t *handle,
748                  struct ocfs2_path *path)
749{
750    int i, ret = 0;
751
752    if (!path)
753        goto out;
754
755    for(i = 0; i < path_num_items(path); i++) {
756        ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
757        if (ret < 0) {
758            mlog_errno(ret);
759            goto out;
760        }
761    }
762
763out:
764    return ret;
765}
766
767/*
768 * Return the index of the extent record which contains cluster #v_cluster.
769 * -1 is returned if it was not found.
770 *
771 * Should work fine on interior and exterior nodes.
772 */
773int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
774{
775    int ret = -1;
776    int i;
777    struct ocfs2_extent_rec *rec;
778    u32 rec_end, rec_start, clusters;
779
780    for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
781        rec = &el->l_recs[i];
782
783        rec_start = le32_to_cpu(rec->e_cpos);
784        clusters = ocfs2_rec_clusters(el, rec);
785
786        rec_end = rec_start + clusters;
787
788        if (v_cluster >= rec_start && v_cluster < rec_end) {
789            ret = i;
790            break;
791        }
792    }
793
794    return ret;
795}
796
797/*
798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
799 * ocfs2_extent_rec_contig only work properly against leaf nodes!
800 */
801static int ocfs2_block_extent_contig(struct super_block *sb,
802                     struct ocfs2_extent_rec *ext,
803                     u64 blkno)
804{
805    u64 blk_end = le64_to_cpu(ext->e_blkno);
806
807    blk_end += ocfs2_clusters_to_blocks(sb,
808                    le16_to_cpu(ext->e_leaf_clusters));
809
810    return blkno == blk_end;
811}
812
813static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
814                  struct ocfs2_extent_rec *right)
815{
816    u32 left_range;
817
818    left_range = le32_to_cpu(left->e_cpos) +
819        le16_to_cpu(left->e_leaf_clusters);
820
821    return (left_range == le32_to_cpu(right->e_cpos));
822}
823
824static enum ocfs2_contig_type
825    ocfs2_extent_rec_contig(struct super_block *sb,
826                struct ocfs2_extent_rec *ext,
827                struct ocfs2_extent_rec *insert_rec)
828{
829    u64 blkno = le64_to_cpu(insert_rec->e_blkno);
830
831    /*
832     * Refuse to coalesce extent records with different flag
833     * fields - we don't want to mix unwritten extents with user
834     * data.
835     */
836    if (ext->e_flags != insert_rec->e_flags)
837        return CONTIG_NONE;
838
839    if (ocfs2_extents_adjacent(ext, insert_rec) &&
840        ocfs2_block_extent_contig(sb, ext, blkno))
841            return CONTIG_RIGHT;
842
843    blkno = le64_to_cpu(ext->e_blkno);
844    if (ocfs2_extents_adjacent(insert_rec, ext) &&
845        ocfs2_block_extent_contig(sb, insert_rec, blkno))
846        return CONTIG_LEFT;
847
848    return CONTIG_NONE;
849}
850
851/*
852 * NOTE: We can have pretty much any combination of contiguousness and
853 * appending.
854 *
855 * The usefulness of APPEND_TAIL is more in that it lets us know that
856 * we'll have to update the path to that leaf.
857 */
858enum ocfs2_append_type {
859    APPEND_NONE = 0,
860    APPEND_TAIL,
861};
862
863enum ocfs2_split_type {
864    SPLIT_NONE = 0,
865    SPLIT_LEFT,
866    SPLIT_RIGHT,
867};
868
869struct ocfs2_insert_type {
870    enum ocfs2_split_type ins_split;
871    enum ocfs2_append_type ins_appending;
872    enum ocfs2_contig_type ins_contig;
873    int ins_contig_index;
874    int ins_tree_depth;
875};
876
877struct ocfs2_merge_ctxt {
878    enum ocfs2_contig_type c_contig_type;
879    int c_has_empty_extent;
880    int c_split_covers_rec;
881};
882
883static int ocfs2_validate_extent_block(struct super_block *sb,
884                       struct buffer_head *bh)
885{
886    int rc;
887    struct ocfs2_extent_block *eb =
888        (struct ocfs2_extent_block *)bh->b_data;
889
890    trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
891
892    BUG_ON(!buffer_uptodate(bh));
893
894    /*
895     * If the ecc fails, we return the error but otherwise
896     * leave the filesystem running. We know any error is
897     * local to this block.
898     */
899    rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
900    if (rc) {
901        mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
902             (unsigned long long)bh->b_blocknr);
903        return rc;
904    }
905
906    /*
907     * Errors after here are fatal.
908     */
909
910    if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
911        ocfs2_error(sb,
912                "Extent block #%llu has bad signature %.*s",
913                (unsigned long long)bh->b_blocknr, 7,
914                eb->h_signature);
915        return -EINVAL;
916    }
917
918    if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
919        ocfs2_error(sb,
920                "Extent block #%llu has an invalid h_blkno "
921                "of %llu",
922                (unsigned long long)bh->b_blocknr,
923                (unsigned long long)le64_to_cpu(eb->h_blkno));
924        return -EINVAL;
925    }
926
927    if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
928        ocfs2_error(sb,
929                "Extent block #%llu has an invalid "
930                "h_fs_generation of #%u",
931                (unsigned long long)bh->b_blocknr,
932                le32_to_cpu(eb->h_fs_generation));
933        return -EINVAL;
934    }
935
936    return 0;
937}
938
939int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
940                struct buffer_head **bh)
941{
942    int rc;
943    struct buffer_head *tmp = *bh;
944
945    rc = ocfs2_read_block(ci, eb_blkno, &tmp,
946                  ocfs2_validate_extent_block);
947
948    /* If ocfs2_read_block() got us a new bh, pass it up. */
949    if (!rc && !*bh)
950        *bh = tmp;
951
952    return rc;
953}
954
955
956/*
957 * How many free extents have we got before we need more meta data?
958 */
959int ocfs2_num_free_extents(struct ocfs2_super *osb,
960               struct ocfs2_extent_tree *et)
961{
962    int retval;
963    struct ocfs2_extent_list *el = NULL;
964    struct ocfs2_extent_block *eb;
965    struct buffer_head *eb_bh = NULL;
966    u64 last_eb_blk = 0;
967
968    el = et->et_root_el;
969    last_eb_blk = ocfs2_et_get_last_eb_blk(et);
970
971    if (last_eb_blk) {
972        retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
973                         &eb_bh);
974        if (retval < 0) {
975            mlog_errno(retval);
976            goto bail;
977        }
978        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
979        el = &eb->h_list;
980    }
981
982    BUG_ON(el->l_tree_depth != 0);
983
984    retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
985bail:
986    brelse(eb_bh);
987
988    trace_ocfs2_num_free_extents(retval);
989    return retval;
990}
991
992/* expects array to already be allocated
993 *
994 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
995 * l_count for you
996 */
997static int ocfs2_create_new_meta_bhs(handle_t *handle,
998                     struct ocfs2_extent_tree *et,
999                     int wanted,
1000                     struct ocfs2_alloc_context *meta_ac,
1001                     struct buffer_head *bhs[])
1002{
1003    int count, status, i;
1004    u16 suballoc_bit_start;
1005    u32 num_got;
1006    u64 suballoc_loc, first_blkno;
1007    struct ocfs2_super *osb =
1008        OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1009    struct ocfs2_extent_block *eb;
1010
1011    count = 0;
1012    while (count < wanted) {
1013        status = ocfs2_claim_metadata(handle,
1014                          meta_ac,
1015                          wanted - count,
1016                          &suballoc_loc,
1017                          &suballoc_bit_start,
1018                          &num_got,
1019                          &first_blkno);
1020        if (status < 0) {
1021            mlog_errno(status);
1022            goto bail;
1023        }
1024
1025        for(i = count; i < (num_got + count); i++) {
1026            bhs[i] = sb_getblk(osb->sb, first_blkno);
1027            if (bhs[i] == NULL) {
1028                status = -EIO;
1029                mlog_errno(status);
1030                goto bail;
1031            }
1032            ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1033
1034            status = ocfs2_journal_access_eb(handle, et->et_ci,
1035                             bhs[i],
1036                             OCFS2_JOURNAL_ACCESS_CREATE);
1037            if (status < 0) {
1038                mlog_errno(status);
1039                goto bail;
1040            }
1041
1042            memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1043            eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1044            /* Ok, setup the minimal stuff here. */
1045            strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1046            eb->h_blkno = cpu_to_le64(first_blkno);
1047            eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1048            eb->h_suballoc_slot =
1049                cpu_to_le16(meta_ac->ac_alloc_slot);
1050            eb->h_suballoc_loc = cpu_to_le64(suballoc_loc);
1051            eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1052            eb->h_list.l_count =
1053                cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1054
1055            suballoc_bit_start++;
1056            first_blkno++;
1057
1058            /* We'll also be dirtied by the caller, so
1059             * this isn't absolutely necessary. */
1060            ocfs2_journal_dirty(handle, bhs[i]);
1061        }
1062
1063        count += num_got;
1064    }
1065
1066    status = 0;
1067bail:
1068    if (status < 0) {
1069        for(i = 0; i < wanted; i++) {
1070            brelse(bhs[i]);
1071            bhs[i] = NULL;
1072        }
1073        mlog_errno(status);
1074    }
1075    return status;
1076}
1077
1078/*
1079 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1080 *
1081 * Returns the sum of the rightmost extent rec logical offset and
1082 * cluster count.
1083 *
1084 * ocfs2_add_branch() uses this to determine what logical cluster
1085 * value should be populated into the leftmost new branch records.
1086 *
1087 * ocfs2_shift_tree_depth() uses this to determine the # clusters
1088 * value for the new topmost tree record.
1089 */
1090static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
1091{
1092    int i;
1093
1094    i = le16_to_cpu(el->l_next_free_rec) - 1;
1095
1096    return le32_to_cpu(el->l_recs[i].e_cpos) +
1097        ocfs2_rec_clusters(el, &el->l_recs[i]);
1098}
1099
1100/*
1101 * Change range of the branches in the right most path according to the leaf
1102 * extent block's rightmost record.
1103 */
1104static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1105                     struct ocfs2_extent_tree *et)
1106{
1107    int status;
1108    struct ocfs2_path *path = NULL;
1109    struct ocfs2_extent_list *el;
1110    struct ocfs2_extent_rec *rec;
1111
1112    path = ocfs2_new_path_from_et(et);
1113    if (!path) {
1114        status = -ENOMEM;
1115        return status;
1116    }
1117
1118    status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1119    if (status < 0) {
1120        mlog_errno(status);
1121        goto out;
1122    }
1123
1124    status = ocfs2_extend_trans(handle, path_num_items(path));
1125    if (status < 0) {
1126        mlog_errno(status);
1127        goto out;
1128    }
1129
1130    status = ocfs2_journal_access_path(et->et_ci, handle, path);
1131    if (status < 0) {
1132        mlog_errno(status);
1133        goto out;
1134    }
1135
1136    el = path_leaf_el(path);
1137    rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1138
1139    ocfs2_adjust_rightmost_records(handle, et, path, rec);
1140
1141out:
1142    ocfs2_free_path(path);
1143    return status;
1144}
1145
1146/*
1147 * Add an entire tree branch to our inode. eb_bh is the extent block
1148 * to start at, if we don't want to start the branch at the root
1149 * structure.
1150 *
1151 * last_eb_bh is required as we have to update it's next_leaf pointer
1152 * for the new last extent block.
1153 *
1154 * the new branch will be 'empty' in the sense that every block will
1155 * contain a single record with cluster count == 0.
1156 */
1157static int ocfs2_add_branch(handle_t *handle,
1158                struct ocfs2_extent_tree *et,
1159                struct buffer_head *eb_bh,
1160                struct buffer_head **last_eb_bh,
1161                struct ocfs2_alloc_context *meta_ac)
1162{
1163    int status, new_blocks, i;
1164    u64 next_blkno, new_last_eb_blk;
1165    struct buffer_head *bh;
1166    struct buffer_head **new_eb_bhs = NULL;
1167    struct ocfs2_extent_block *eb;
1168    struct ocfs2_extent_list *eb_el;
1169    struct ocfs2_extent_list *el;
1170    u32 new_cpos, root_end;
1171
1172    BUG_ON(!last_eb_bh || !*last_eb_bh);
1173
1174    if (eb_bh) {
1175        eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1176        el = &eb->h_list;
1177    } else
1178        el = et->et_root_el;
1179
1180    /* we never add a branch to a leaf. */
1181    BUG_ON(!el->l_tree_depth);
1182
1183    new_blocks = le16_to_cpu(el->l_tree_depth);
1184
1185    eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1186    new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1187    root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1188
1189    /*
1190     * If there is a gap before the root end and the real end
1191     * of the righmost leaf block, we need to remove the gap
1192     * between new_cpos and root_end first so that the tree
1193     * is consistent after we add a new branch(it will start
1194     * from new_cpos).
1195     */
1196    if (root_end > new_cpos) {
1197        trace_ocfs2_adjust_rightmost_branch(
1198            (unsigned long long)
1199            ocfs2_metadata_cache_owner(et->et_ci),
1200            root_end, new_cpos);
1201
1202        status = ocfs2_adjust_rightmost_branch(handle, et);
1203        if (status) {
1204            mlog_errno(status);
1205            goto bail;
1206        }
1207    }
1208
1209    /* allocate the number of new eb blocks we need */
1210    new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1211                 GFP_KERNEL);
1212    if (!new_eb_bhs) {
1213        status = -ENOMEM;
1214        mlog_errno(status);
1215        goto bail;
1216    }
1217
1218    status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1219                       meta_ac, new_eb_bhs);
1220    if (status < 0) {
1221        mlog_errno(status);
1222        goto bail;
1223    }
1224
1225    /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1226     * linked with the rest of the tree.
1227     * conversly, new_eb_bhs[0] is the new bottommost leaf.
1228     *
1229     * when we leave the loop, new_last_eb_blk will point to the
1230     * newest leaf, and next_blkno will point to the topmost extent
1231     * block. */
1232    next_blkno = new_last_eb_blk = 0;
1233    for(i = 0; i < new_blocks; i++) {
1234        bh = new_eb_bhs[i];
1235        eb = (struct ocfs2_extent_block *) bh->b_data;
1236        /* ocfs2_create_new_meta_bhs() should create it right! */
1237        BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1238        eb_el = &eb->h_list;
1239
1240        status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1241                         OCFS2_JOURNAL_ACCESS_CREATE);
1242        if (status < 0) {
1243            mlog_errno(status);
1244            goto bail;
1245        }
1246
1247        eb->h_next_leaf_blk = 0;
1248        eb_el->l_tree_depth = cpu_to_le16(i);
1249        eb_el->l_next_free_rec = cpu_to_le16(1);
1250        /*
1251         * This actually counts as an empty extent as
1252         * c_clusters == 0
1253         */
1254        eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1255        eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1256        /*
1257         * eb_el isn't always an interior node, but even leaf
1258         * nodes want a zero'd flags and reserved field so
1259         * this gets the whole 32 bits regardless of use.
1260         */
1261        eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1262        if (!eb_el->l_tree_depth)
1263            new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1264
1265        ocfs2_journal_dirty(handle, bh);
1266        next_blkno = le64_to_cpu(eb->h_blkno);
1267    }
1268
1269    /* This is a bit hairy. We want to update up to three blocks
1270     * here without leaving any of them in an inconsistent state
1271     * in case of error. We don't have to worry about
1272     * journal_dirty erroring as it won't unless we've aborted the
1273     * handle (in which case we would never be here) so reserving
1274     * the write with journal_access is all we need to do. */
1275    status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1276                     OCFS2_JOURNAL_ACCESS_WRITE);
1277    if (status < 0) {
1278        mlog_errno(status);
1279        goto bail;
1280    }
1281    status = ocfs2_et_root_journal_access(handle, et,
1282                          OCFS2_JOURNAL_ACCESS_WRITE);
1283    if (status < 0) {
1284        mlog_errno(status);
1285        goto bail;
1286    }
1287    if (eb_bh) {
1288        status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1289                         OCFS2_JOURNAL_ACCESS_WRITE);
1290        if (status < 0) {
1291            mlog_errno(status);
1292            goto bail;
1293        }
1294    }
1295
1296    /* Link the new branch into the rest of the tree (el will
1297     * either be on the root_bh, or the extent block passed in. */
1298    i = le16_to_cpu(el->l_next_free_rec);
1299    el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1300    el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1301    el->l_recs[i].e_int_clusters = 0;
1302    le16_add_cpu(&el->l_next_free_rec, 1);
1303
1304    /* fe needs a new last extent block pointer, as does the
1305     * next_leaf on the previously last-extent-block. */
1306    ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1307
1308    eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1309    eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1310
1311    ocfs2_journal_dirty(handle, *last_eb_bh);
1312    ocfs2_journal_dirty(handle, et->et_root_bh);
1313    if (eb_bh)
1314        ocfs2_journal_dirty(handle, eb_bh);
1315
1316    /*
1317     * Some callers want to track the rightmost leaf so pass it
1318     * back here.
1319     */
1320    brelse(*last_eb_bh);
1321    get_bh(new_eb_bhs[0]);
1322    *last_eb_bh = new_eb_bhs[0];
1323
1324    status = 0;
1325bail:
1326    if (new_eb_bhs) {
1327        for (i = 0; i < new_blocks; i++)
1328            brelse(new_eb_bhs[i]);
1329        kfree(new_eb_bhs);
1330    }
1331
1332    return status;
1333}
1334
1335/*
1336 * adds another level to the allocation tree.
1337 * returns back the new extent block so you can add a branch to it
1338 * after this call.
1339 */
1340static int ocfs2_shift_tree_depth(handle_t *handle,
1341                  struct ocfs2_extent_tree *et,
1342                  struct ocfs2_alloc_context *meta_ac,
1343                  struct buffer_head **ret_new_eb_bh)
1344{
1345    int status, i;
1346    u32 new_clusters;
1347    struct buffer_head *new_eb_bh = NULL;
1348    struct ocfs2_extent_block *eb;
1349    struct ocfs2_extent_list *root_el;
1350    struct ocfs2_extent_list *eb_el;
1351
1352    status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1353                       &new_eb_bh);
1354    if (status < 0) {
1355        mlog_errno(status);
1356        goto bail;
1357    }
1358
1359    eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1360    /* ocfs2_create_new_meta_bhs() should create it right! */
1361    BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1362
1363    eb_el = &eb->h_list;
1364    root_el = et->et_root_el;
1365
1366    status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1367                     OCFS2_JOURNAL_ACCESS_CREATE);
1368    if (status < 0) {
1369        mlog_errno(status);
1370        goto bail;
1371    }
1372
1373    /* copy the root extent list data into the new extent block */
1374    eb_el->l_tree_depth = root_el->l_tree_depth;
1375    eb_el->l_next_free_rec = root_el->l_next_free_rec;
1376    for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1377        eb_el->l_recs[i] = root_el->l_recs[i];
1378
1379    ocfs2_journal_dirty(handle, new_eb_bh);
1380
1381    status = ocfs2_et_root_journal_access(handle, et,
1382                          OCFS2_JOURNAL_ACCESS_WRITE);
1383    if (status < 0) {
1384        mlog_errno(status);
1385        goto bail;
1386    }
1387
1388    new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1389
1390    /* update root_bh now */
1391    le16_add_cpu(&root_el->l_tree_depth, 1);
1392    root_el->l_recs[0].e_cpos = 0;
1393    root_el->l_recs[0].e_blkno = eb->h_blkno;
1394    root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1395    for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1396        memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1397    root_el->l_next_free_rec = cpu_to_le16(1);
1398
1399    /* If this is our 1st tree depth shift, then last_eb_blk
1400     * becomes the allocated extent block */
1401    if (root_el->l_tree_depth == cpu_to_le16(1))
1402        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1403
1404    ocfs2_journal_dirty(handle, et->et_root_bh);
1405
1406    *ret_new_eb_bh = new_eb_bh;
1407    new_eb_bh = NULL;
1408    status = 0;
1409bail:
1410    brelse(new_eb_bh);
1411
1412    return status;
1413}
1414
1415/*
1416 * Should only be called when there is no space left in any of the
1417 * leaf nodes. What we want to do is find the lowest tree depth
1418 * non-leaf extent block with room for new records. There are three
1419 * valid results of this search:
1420 *
1421 * 1) a lowest extent block is found, then we pass it back in
1422 * *lowest_eb_bh and return '0'
1423 *
1424 * 2) the search fails to find anything, but the root_el has room. We
1425 * pass NULL back in *lowest_eb_bh, but still return '0'
1426 *
1427 * 3) the search fails to find anything AND the root_el is full, in
1428 * which case we return > 0
1429 *
1430 * return status < 0 indicates an error.
1431 */
1432static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1433                    struct buffer_head **target_bh)
1434{
1435    int status = 0, i;
1436    u64 blkno;
1437    struct ocfs2_extent_block *eb;
1438    struct ocfs2_extent_list *el;
1439    struct buffer_head *bh = NULL;
1440    struct buffer_head *lowest_bh = NULL;
1441
1442    *target_bh = NULL;
1443
1444    el = et->et_root_el;
1445
1446    while(le16_to_cpu(el->l_tree_depth) > 1) {
1447        if (le16_to_cpu(el->l_next_free_rec) == 0) {
1448            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1449                    "Owner %llu has empty "
1450                    "extent list (next_free_rec == 0)",
1451                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1452            status = -EIO;
1453            goto bail;
1454        }
1455        i = le16_to_cpu(el->l_next_free_rec) - 1;
1456        blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1457        if (!blkno) {
1458            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1459                    "Owner %llu has extent "
1460                    "list where extent # %d has no physical "
1461                    "block start",
1462                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1463            status = -EIO;
1464            goto bail;
1465        }
1466
1467        brelse(bh);
1468        bh = NULL;
1469
1470        status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1471        if (status < 0) {
1472            mlog_errno(status);
1473            goto bail;
1474        }
1475
1476        eb = (struct ocfs2_extent_block *) bh->b_data;
1477        el = &eb->h_list;
1478
1479        if (le16_to_cpu(el->l_next_free_rec) <
1480            le16_to_cpu(el->l_count)) {
1481            brelse(lowest_bh);
1482            lowest_bh = bh;
1483            get_bh(lowest_bh);
1484        }
1485    }
1486
1487    /* If we didn't find one and the fe doesn't have any room,
1488     * then return '1' */
1489    el = et->et_root_el;
1490    if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1491        status = 1;
1492
1493    *target_bh = lowest_bh;
1494bail:
1495    brelse(bh);
1496
1497    return status;
1498}
1499
1500/*
1501 * Grow a b-tree so that it has more records.
1502 *
1503 * We might shift the tree depth in which case existing paths should
1504 * be considered invalid.
1505 *
1506 * Tree depth after the grow is returned via *final_depth.
1507 *
1508 * *last_eb_bh will be updated by ocfs2_add_branch().
1509 */
1510static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1511               int *final_depth, struct buffer_head **last_eb_bh,
1512               struct ocfs2_alloc_context *meta_ac)
1513{
1514    int ret, shift;
1515    struct ocfs2_extent_list *el = et->et_root_el;
1516    int depth = le16_to_cpu(el->l_tree_depth);
1517    struct buffer_head *bh = NULL;
1518
1519    BUG_ON(meta_ac == NULL);
1520
1521    shift = ocfs2_find_branch_target(et, &bh);
1522    if (shift < 0) {
1523        ret = shift;
1524        mlog_errno(ret);
1525        goto out;
1526    }
1527
1528    /* We traveled all the way to the bottom of the allocation tree
1529     * and didn't find room for any more extents - we need to add
1530     * another tree level */
1531    if (shift) {
1532        BUG_ON(bh);
1533        trace_ocfs2_grow_tree(
1534            (unsigned long long)
1535            ocfs2_metadata_cache_owner(et->et_ci),
1536            depth);
1537
1538        /* ocfs2_shift_tree_depth will return us a buffer with
1539         * the new extent block (so we can pass that to
1540         * ocfs2_add_branch). */
1541        ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1542        if (ret < 0) {
1543            mlog_errno(ret);
1544            goto out;
1545        }
1546        depth++;
1547        if (depth == 1) {
1548            /*
1549             * Special case: we have room now if we shifted from
1550             * tree_depth 0, so no more work needs to be done.
1551             *
1552             * We won't be calling add_branch, so pass
1553             * back *last_eb_bh as the new leaf. At depth
1554             * zero, it should always be null so there's
1555             * no reason to brelse.
1556             */
1557            BUG_ON(*last_eb_bh);
1558            get_bh(bh);
1559            *last_eb_bh = bh;
1560            goto out;
1561        }
1562    }
1563
1564    /* call ocfs2_add_branch to add the final part of the tree with
1565     * the new data. */
1566    ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1567                   meta_ac);
1568    if (ret < 0) {
1569        mlog_errno(ret);
1570        goto out;
1571    }
1572
1573out:
1574    if (final_depth)
1575        *final_depth = depth;
1576    brelse(bh);
1577    return ret;
1578}
1579
1580/*
1581 * This function will discard the rightmost extent record.
1582 */
1583static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1584{
1585    int next_free = le16_to_cpu(el->l_next_free_rec);
1586    int count = le16_to_cpu(el->l_count);
1587    unsigned int num_bytes;
1588
1589    BUG_ON(!next_free);
1590    /* This will cause us to go off the end of our extent list. */
1591    BUG_ON(next_free >= count);
1592
1593    num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1594
1595    memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1596}
1597
1598static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1599                  struct ocfs2_extent_rec *insert_rec)
1600{
1601    int i, insert_index, next_free, has_empty, num_bytes;
1602    u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1603    struct ocfs2_extent_rec *rec;
1604
1605    next_free = le16_to_cpu(el->l_next_free_rec);
1606    has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1607
1608    BUG_ON(!next_free);
1609
1610    /* The tree code before us didn't allow enough room in the leaf. */
1611    BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1612
1613    /*
1614     * The easiest way to approach this is to just remove the
1615     * empty extent and temporarily decrement next_free.
1616     */
1617    if (has_empty) {
1618        /*
1619         * If next_free was 1 (only an empty extent), this
1620         * loop won't execute, which is fine. We still want
1621         * the decrement above to happen.
1622         */
1623        for(i = 0; i < (next_free - 1); i++)
1624            el->l_recs[i] = el->l_recs[i+1];
1625
1626        next_free--;
1627    }
1628
1629    /*
1630     * Figure out what the new record index should be.
1631     */
1632    for(i = 0; i < next_free; i++) {
1633        rec = &el->l_recs[i];
1634
1635        if (insert_cpos < le32_to_cpu(rec->e_cpos))
1636            break;
1637    }
1638    insert_index = i;
1639
1640    trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1641                has_empty, next_free,
1642                le16_to_cpu(el->l_count));
1643
1644    BUG_ON(insert_index < 0);
1645    BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1646    BUG_ON(insert_index > next_free);
1647
1648    /*
1649     * No need to memmove if we're just adding to the tail.
1650     */
1651    if (insert_index != next_free) {
1652        BUG_ON(next_free >= le16_to_cpu(el->l_count));
1653
1654        num_bytes = next_free - insert_index;
1655        num_bytes *= sizeof(struct ocfs2_extent_rec);
1656        memmove(&el->l_recs[insert_index + 1],
1657            &el->l_recs[insert_index],
1658            num_bytes);
1659    }
1660
1661    /*
1662     * Either we had an empty extent, and need to re-increment or
1663     * there was no empty extent on a non full rightmost leaf node,
1664     * in which case we still need to increment.
1665     */
1666    next_free++;
1667    el->l_next_free_rec = cpu_to_le16(next_free);
1668    /*
1669     * Make sure none of the math above just messed up our tree.
1670     */
1671    BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1672
1673    el->l_recs[insert_index] = *insert_rec;
1674
1675}
1676
1677static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1678{
1679    int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1680
1681    BUG_ON(num_recs == 0);
1682
1683    if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1684        num_recs--;
1685        size = num_recs * sizeof(struct ocfs2_extent_rec);
1686        memmove(&el->l_recs[0], &el->l_recs[1], size);
1687        memset(&el->l_recs[num_recs], 0,
1688               sizeof(struct ocfs2_extent_rec));
1689        el->l_next_free_rec = cpu_to_le16(num_recs);
1690    }
1691}
1692
1693/*
1694 * Create an empty extent record .
1695 *
1696 * l_next_free_rec may be updated.
1697 *
1698 * If an empty extent already exists do nothing.
1699 */
1700static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1701{
1702    int next_free = le16_to_cpu(el->l_next_free_rec);
1703
1704    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1705
1706    if (next_free == 0)
1707        goto set_and_inc;
1708
1709    if (ocfs2_is_empty_extent(&el->l_recs[0]))
1710        return;
1711
1712    mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1713            "Asked to create an empty extent in a full list:\n"
1714            "count = %u, tree depth = %u",
1715            le16_to_cpu(el->l_count),
1716            le16_to_cpu(el->l_tree_depth));
1717
1718    ocfs2_shift_records_right(el);
1719
1720set_and_inc:
1721    le16_add_cpu(&el->l_next_free_rec, 1);
1722    memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1723}
1724
1725/*
1726 * For a rotation which involves two leaf nodes, the "root node" is
1727 * the lowest level tree node which contains a path to both leafs. This
1728 * resulting set of information can be used to form a complete "subtree"
1729 *
1730 * This function is passed two full paths from the dinode down to a
1731 * pair of adjacent leaves. It's task is to figure out which path
1732 * index contains the subtree root - this can be the root index itself
1733 * in a worst-case rotation.
1734 *
1735 * The array index of the subtree root is passed back.
1736 */
1737int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1738                struct ocfs2_path *left,
1739                struct ocfs2_path *right)
1740{
1741    int i = 0;
1742
1743    /*
1744     * Check that the caller passed in two paths from the same tree.
1745     */
1746    BUG_ON(path_root_bh(left) != path_root_bh(right));
1747
1748    do {
1749        i++;
1750
1751        /*
1752         * The caller didn't pass two adjacent paths.
1753         */
1754        mlog_bug_on_msg(i > left->p_tree_depth,
1755                "Owner %llu, left depth %u, right depth %u\n"
1756                "left leaf blk %llu, right leaf blk %llu\n",
1757                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1758                left->p_tree_depth, right->p_tree_depth,
1759                (unsigned long long)path_leaf_bh(left)->b_blocknr,
1760                (unsigned long long)path_leaf_bh(right)->b_blocknr);
1761    } while (left->p_node[i].bh->b_blocknr ==
1762         right->p_node[i].bh->b_blocknr);
1763
1764    return i - 1;
1765}
1766
1767typedef void (path_insert_t)(void *, struct buffer_head *);
1768
1769/*
1770 * Traverse a btree path in search of cpos, starting at root_el.
1771 *
1772 * This code can be called with a cpos larger than the tree, in which
1773 * case it will return the rightmost path.
1774 */
1775static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1776                 struct ocfs2_extent_list *root_el, u32 cpos,
1777                 path_insert_t *func, void *data)
1778{
1779    int i, ret = 0;
1780    u32 range;
1781    u64 blkno;
1782    struct buffer_head *bh = NULL;
1783    struct ocfs2_extent_block *eb;
1784    struct ocfs2_extent_list *el;
1785    struct ocfs2_extent_rec *rec;
1786
1787    el = root_el;
1788    while (el->l_tree_depth) {
1789        if (le16_to_cpu(el->l_next_free_rec) == 0) {
1790            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1791                    "Owner %llu has empty extent list at "
1792                    "depth %u\n",
1793                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1794                    le16_to_cpu(el->l_tree_depth));
1795            ret = -EROFS;
1796            goto out;
1797
1798        }
1799
1800        for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1801            rec = &el->l_recs[i];
1802
1803            /*
1804             * In the case that cpos is off the allocation
1805             * tree, this should just wind up returning the
1806             * rightmost record.
1807             */
1808            range = le32_to_cpu(rec->e_cpos) +
1809                ocfs2_rec_clusters(el, rec);
1810            if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1811                break;
1812        }
1813
1814        blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1815        if (blkno == 0) {
1816            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1817                    "Owner %llu has bad blkno in extent list "
1818                    "at depth %u (index %d)\n",
1819                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1820                    le16_to_cpu(el->l_tree_depth), i);
1821            ret = -EROFS;
1822            goto out;
1823        }
1824
1825        brelse(bh);
1826        bh = NULL;
1827        ret = ocfs2_read_extent_block(ci, blkno, &bh);
1828        if (ret) {
1829            mlog_errno(ret);
1830            goto out;
1831        }
1832
1833        eb = (struct ocfs2_extent_block *) bh->b_data;
1834        el = &eb->h_list;
1835
1836        if (le16_to_cpu(el->l_next_free_rec) >
1837            le16_to_cpu(el->l_count)) {
1838            ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1839                    "Owner %llu has bad count in extent list "
1840                    "at block %llu (next free=%u, count=%u)\n",
1841                    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1842                    (unsigned long long)bh->b_blocknr,
1843                    le16_to_cpu(el->l_next_free_rec),
1844                    le16_to_cpu(el->l_count));
1845            ret = -EROFS;
1846            goto out;
1847        }
1848
1849        if (func)
1850            func(data, bh);
1851    }
1852
1853out:
1854    /*
1855     * Catch any trailing bh that the loop didn't handle.
1856     */
1857    brelse(bh);
1858
1859    return ret;
1860}
1861
1862/*
1863 * Given an initialized path (that is, it has a valid root extent
1864 * list), this function will traverse the btree in search of the path
1865 * which would contain cpos.
1866 *
1867 * The path traveled is recorded in the path structure.
1868 *
1869 * Note that this will not do any comparisons on leaf node extent
1870 * records, so it will work fine in the case that we just added a tree
1871 * branch.
1872 */
1873struct find_path_data {
1874    int index;
1875    struct ocfs2_path *path;
1876};
1877static void find_path_ins(void *data, struct buffer_head *bh)
1878{
1879    struct find_path_data *fp = data;
1880
1881    get_bh(bh);
1882    ocfs2_path_insert_eb(fp->path, fp->index, bh);
1883    fp->index++;
1884}
1885int ocfs2_find_path(struct ocfs2_caching_info *ci,
1886            struct ocfs2_path *path, u32 cpos)
1887{
1888    struct find_path_data data;
1889
1890    data.index = 1;
1891    data.path = path;
1892    return __ocfs2_find_path(ci, path_root_el(path), cpos,
1893                 find_path_ins, &data);
1894}
1895
1896static void find_leaf_ins(void *data, struct buffer_head *bh)
1897{
1898    struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1899    struct ocfs2_extent_list *el = &eb->h_list;
1900    struct buffer_head **ret = data;
1901
1902    /* We want to retain only the leaf block. */
1903    if (le16_to_cpu(el->l_tree_depth) == 0) {
1904        get_bh(bh);
1905        *ret = bh;
1906    }
1907}
1908/*
1909 * Find the leaf block in the tree which would contain cpos. No
1910 * checking of the actual leaf is done.
1911 *
1912 * Some paths want to call this instead of allocating a path structure
1913 * and calling ocfs2_find_path().
1914 *
1915 * This function doesn't handle non btree extent lists.
1916 */
1917int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1918            struct ocfs2_extent_list *root_el, u32 cpos,
1919            struct buffer_head **leaf_bh)
1920{
1921    int ret;
1922    struct buffer_head *bh = NULL;
1923
1924    ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1925    if (ret) {
1926        mlog_errno(ret);
1927        goto out;
1928    }
1929
1930    *leaf_bh = bh;
1931out:
1932    return ret;
1933}
1934
1935/*
1936 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1937 *
1938 * Basically, we've moved stuff around at the bottom of the tree and
1939 * we need to fix up the extent records above the changes to reflect
1940 * the new changes.
1941 *
1942 * left_rec: the record on the left.
1943 * left_child_el: is the child list pointed to by left_rec
1944 * right_rec: the record to the right of left_rec
1945 * right_child_el: is the child list pointed to by right_rec
1946 *
1947 * By definition, this only works on interior nodes.
1948 */
1949static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1950                  struct ocfs2_extent_list *left_child_el,
1951                  struct ocfs2_extent_rec *right_rec,
1952                  struct ocfs2_extent_list *right_child_el)
1953{
1954    u32 left_clusters, right_end;
1955
1956    /*
1957     * Interior nodes never have holes. Their cpos is the cpos of
1958     * the leftmost record in their child list. Their cluster
1959     * count covers the full theoretical range of their child list
1960     * - the range between their cpos and the cpos of the record
1961     * immediately to their right.
1962     */
1963    left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1964    if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1965        BUG_ON(right_child_el->l_tree_depth);
1966        BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1967        left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1968    }
1969    left_clusters -= le32_to_cpu(left_rec->e_cpos);
1970    left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1971
1972    /*
1973     * Calculate the rightmost cluster count boundary before
1974     * moving cpos - we will need to adjust clusters after
1975     * updating e_cpos to keep the same highest cluster count.
1976     */
1977    right_end = le32_to_cpu(right_rec->e_cpos);
1978    right_end += le32_to_cpu(right_rec->e_int_clusters);
1979
1980    right_rec->e_cpos = left_rec->e_cpos;
1981    le32_add_cpu(&right_rec->e_cpos, left_clusters);
1982
1983    right_end -= le32_to_cpu(right_rec->e_cpos);
1984    right_rec->e_int_clusters = cpu_to_le32(right_end);
1985}
1986
1987/*
1988 * Adjust the adjacent root node records involved in a
1989 * rotation. left_el_blkno is passed in as a key so that we can easily
1990 * find it's index in the root list.
1991 */
1992static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1993                      struct ocfs2_extent_list *left_el,
1994                      struct ocfs2_extent_list *right_el,
1995                      u64 left_el_blkno)
1996{
1997    int i;
1998
1999    BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2000           le16_to_cpu(left_el->l_tree_depth));
2001
2002    for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2003        if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2004            break;
2005    }
2006
2007    /*
2008     * The path walking code should have never returned a root and
2009     * two paths which are not adjacent.
2010     */
2011    BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2012
2013    ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2014                      &root_el->l_recs[i + 1], right_el);
2015}
2016
2017/*
2018 * We've changed a leaf block (in right_path) and need to reflect that
2019 * change back up the subtree.
2020 *
2021 * This happens in multiple places:
2022 * - When we've moved an extent record from the left path leaf to the right
2023 * path leaf to make room for an empty extent in the left path leaf.
2024 * - When our insert into the right path leaf is at the leftmost edge
2025 * and requires an update of the path immediately to it's left. This
2026 * can occur at the end of some types of rotation and appending inserts.
2027 * - When we've adjusted the last extent record in the left path leaf and the
2028 * 1st extent record in the right path leaf during cross extent block merge.
2029 */
2030static void ocfs2_complete_edge_insert(handle_t *handle,
2031                       struct ocfs2_path *left_path,
2032                       struct ocfs2_path *right_path,
2033                       int subtree_index)
2034{
2035    int i, idx;
2036    struct ocfs2_extent_list *el, *left_el, *right_el;
2037    struct ocfs2_extent_rec *left_rec, *right_rec;
2038    struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2039
2040    /*
2041     * Update the counts and position values within all the
2042     * interior nodes to reflect the leaf rotation we just did.
2043     *
2044     * The root node is handled below the loop.
2045     *
2046     * We begin the loop with right_el and left_el pointing to the
2047     * leaf lists and work our way up.
2048     *
2049     * NOTE: within this loop, left_el and right_el always refer
2050     * to the *child* lists.
2051     */
2052    left_el = path_leaf_el(left_path);
2053    right_el = path_leaf_el(right_path);
2054    for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2055        trace_ocfs2_complete_edge_insert(i);
2056
2057        /*
2058         * One nice property of knowing that all of these
2059         * nodes are below the root is that we only deal with
2060         * the leftmost right node record and the rightmost
2061         * left node record.
2062         */
2063        el = left_path->p_node[i].el;
2064        idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2065        left_rec = &el->l_recs[idx];
2066
2067        el = right_path->p_node[i].el;
2068        right_rec = &el->l_recs[0];
2069
2070        ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2071                          right_el);
2072
2073        ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2074        ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2075
2076        /*
2077         * Setup our list pointers now so that the current
2078         * parents become children in the next iteration.
2079         */
2080        left_el = left_path->p_node[i].el;
2081        right_el = right_path->p_node[i].el;
2082    }
2083
2084    /*
2085     * At the root node, adjust the two adjacent records which
2086     * begin our path to the leaves.
2087     */
2088
2089    el = left_path->p_node[subtree_index].el;
2090    left_el = left_path->p_node[subtree_index + 1].el;
2091    right_el = right_path->p_node[subtree_index + 1].el;
2092
2093    ocfs2_adjust_root_records(el, left_el, right_el,
2094                  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2095
2096    root_bh = left_path->p_node[subtree_index].bh;
2097
2098    ocfs2_journal_dirty(handle, root_bh);
2099}
2100
2101static int ocfs2_rotate_subtree_right(handle_t *handle,
2102                      struct ocfs2_extent_tree *et,
2103                      struct ocfs2_path *left_path,
2104                      struct ocfs2_path *right_path,
2105                      int subtree_index)
2106{
2107    int ret, i;
2108    struct buffer_head *right_leaf_bh;
2109    struct buffer_head *left_leaf_bh = NULL;
2110    struct buffer_head *root_bh;
2111    struct ocfs2_extent_list *right_el, *left_el;
2112    struct ocfs2_extent_rec move_rec;
2113
2114    left_leaf_bh = path_leaf_bh(left_path);
2115    left_el = path_leaf_el(left_path);
2116
2117    if (left_el->l_next_free_rec != left_el->l_count) {
2118        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2119                "Inode %llu has non-full interior leaf node %llu"
2120                "(next free = %u)",
2121                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2122                (unsigned long long)left_leaf_bh->b_blocknr,
2123                le16_to_cpu(left_el->l_next_free_rec));
2124        return -EROFS;
2125    }
2126
2127    /*
2128     * This extent block may already have an empty record, so we
2129     * return early if so.
2130     */
2131    if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2132        return 0;
2133
2134    root_bh = left_path->p_node[subtree_index].bh;
2135    BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2136
2137    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2138                       subtree_index);
2139    if (ret) {
2140        mlog_errno(ret);
2141        goto out;
2142    }
2143
2144    for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2145        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2146                           right_path, i);
2147        if (ret) {
2148            mlog_errno(ret);
2149            goto out;
2150        }
2151
2152        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2153                           left_path, i);
2154        if (ret) {
2155            mlog_errno(ret);
2156            goto out;
2157        }
2158    }
2159
2160    right_leaf_bh = path_leaf_bh(right_path);
2161    right_el = path_leaf_el(right_path);
2162
2163    /* This is a code error, not a disk corruption. */
2164    mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2165            "because rightmost leaf block %llu is empty\n",
2166            (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2167            (unsigned long long)right_leaf_bh->b_blocknr);
2168
2169    ocfs2_create_empty_extent(right_el);
2170
2171    ocfs2_journal_dirty(handle, right_leaf_bh);
2172
2173    /* Do the copy now. */
2174    i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2175    move_rec = left_el->l_recs[i];
2176    right_el->l_recs[0] = move_rec;
2177
2178    /*
2179     * Clear out the record we just copied and shift everything
2180     * over, leaving an empty extent in the left leaf.
2181     *
2182     * We temporarily subtract from next_free_rec so that the
2183     * shift will lose the tail record (which is now defunct).
2184     */
2185    le16_add_cpu(&left_el->l_next_free_rec, -1);
2186    ocfs2_shift_records_right(left_el);
2187    memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2188    le16_add_cpu(&left_el->l_next_free_rec, 1);
2189
2190    ocfs2_journal_dirty(handle, left_leaf_bh);
2191
2192    ocfs2_complete_edge_insert(handle, left_path, right_path,
2193                   subtree_index);
2194
2195out:
2196    return ret;
2197}
2198
2199/*
2200 * Given a full path, determine what cpos value would return us a path
2201 * containing the leaf immediately to the left of the current one.
2202 *
2203 * Will return zero if the path passed in is already the leftmost path.
2204 */
2205int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2206                  struct ocfs2_path *path, u32 *cpos)
2207{
2208    int i, j, ret = 0;
2209    u64 blkno;
2210    struct ocfs2_extent_list *el;
2211
2212    BUG_ON(path->p_tree_depth == 0);
2213
2214    *cpos = 0;
2215
2216    blkno = path_leaf_bh(path)->b_blocknr;
2217
2218    /* Start at the tree node just above the leaf and work our way up. */
2219    i = path->p_tree_depth - 1;
2220    while (i >= 0) {
2221        el = path->p_node[i].el;
2222
2223        /*
2224         * Find the extent record just before the one in our
2225         * path.
2226         */
2227        for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2228            if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2229                if (j == 0) {
2230                    if (i == 0) {
2231                        /*
2232                         * We've determined that the
2233                         * path specified is already
2234                         * the leftmost one - return a
2235                         * cpos of zero.
2236                         */
2237                        goto out;
2238                    }
2239                    /*
2240                     * The leftmost record points to our
2241                     * leaf - we need to travel up the
2242                     * tree one level.
2243                     */
2244                    goto next_node;
2245                }
2246
2247                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2248                *cpos = *cpos + ocfs2_rec_clusters(el,
2249                               &el->l_recs[j - 1]);
2250                *cpos = *cpos - 1;
2251                goto out;
2252            }
2253        }
2254
2255        /*
2256         * If we got here, we never found a valid node where
2257         * the tree indicated one should be.
2258         */
2259        ocfs2_error(sb,
2260                "Invalid extent tree at extent block %llu\n",
2261                (unsigned long long)blkno);
2262        ret = -EROFS;
2263        goto out;
2264
2265next_node:
2266        blkno = path->p_node[i].bh->b_blocknr;
2267        i--;
2268    }
2269
2270out:
2271    return ret;
2272}
2273
2274/*
2275 * Extend the transaction by enough credits to complete the rotation,
2276 * and still leave at least the original number of credits allocated
2277 * to this transaction.
2278 */
2279static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2280                       int op_credits,
2281                       struct ocfs2_path *path)
2282{
2283    int ret = 0;
2284    int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2285
2286    if (handle->h_buffer_credits < credits)
2287        ret = ocfs2_extend_trans(handle,
2288                     credits - handle->h_buffer_credits);
2289
2290    return ret;
2291}
2292
2293/*
2294 * Trap the case where we're inserting into the theoretical range past
2295 * the _actual_ left leaf range. Otherwise, we'll rotate a record
2296 * whose cpos is less than ours into the right leaf.
2297 *
2298 * It's only necessary to look at the rightmost record of the left
2299 * leaf because the logic that calls us should ensure that the
2300 * theoretical ranges in the path components above the leaves are
2301 * correct.
2302 */
2303static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2304                         u32 insert_cpos)
2305{
2306    struct ocfs2_extent_list *left_el;
2307    struct ocfs2_extent_rec *rec;
2308    int next_free;
2309
2310    left_el = path_leaf_el(left_path);
2311    next_free = le16_to_cpu(left_el->l_next_free_rec);
2312    rec = &left_el->l_recs[next_free - 1];
2313
2314    if (insert_cpos > le32_to_cpu(rec->e_cpos))
2315        return 1;
2316    return 0;
2317}
2318
2319static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2320{
2321    int next_free = le16_to_cpu(el->l_next_free_rec);
2322    unsigned int range;
2323    struct ocfs2_extent_rec *rec;
2324
2325    if (next_free == 0)
2326        return 0;
2327
2328    rec = &el->l_recs[0];
2329    if (ocfs2_is_empty_extent(rec)) {
2330        /* Empty list. */
2331        if (next_free == 1)
2332            return 0;
2333        rec = &el->l_recs[1];
2334    }
2335
2336    range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2337    if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2338        return 1;
2339    return 0;
2340}
2341
2342/*
2343 * Rotate all the records in a btree right one record, starting at insert_cpos.
2344 *
2345 * The path to the rightmost leaf should be passed in.
2346 *
2347 * The array is assumed to be large enough to hold an entire path (tree depth).
2348 *
2349 * Upon successful return from this function:
2350 *
2351 * - The 'right_path' array will contain a path to the leaf block
2352 * whose range contains e_cpos.
2353 * - That leaf block will have a single empty extent in list index 0.
2354 * - In the case that the rotation requires a post-insert update,
2355 * *ret_left_path will contain a valid path which can be passed to
2356 * ocfs2_insert_path().
2357 */
2358static int ocfs2_rotate_tree_right(handle_t *handle,
2359                   struct ocfs2_extent_tree *et,
2360                   enum ocfs2_split_type split,
2361                   u32 insert_cpos,
2362                   struct ocfs2_path *right_path,
2363                   struct ocfs2_path **ret_left_path)
2364{
2365    int ret, start, orig_credits = handle->h_buffer_credits;
2366    u32 cpos;
2367    struct ocfs2_path *left_path = NULL;
2368    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2369
2370    *ret_left_path = NULL;
2371
2372    left_path = ocfs2_new_path_from_path(right_path);
2373    if (!left_path) {
2374        ret = -ENOMEM;
2375        mlog_errno(ret);
2376        goto out;
2377    }
2378
2379    ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2380    if (ret) {
2381        mlog_errno(ret);
2382        goto out;
2383    }
2384
2385    trace_ocfs2_rotate_tree_right(
2386        (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2387        insert_cpos, cpos);
2388
2389    /*
2390     * What we want to do here is:
2391     *
2392     * 1) Start with the rightmost path.
2393     *
2394     * 2) Determine a path to the leaf block directly to the left
2395     * of that leaf.
2396     *
2397     * 3) Determine the 'subtree root' - the lowest level tree node
2398     * which contains a path to both leaves.
2399     *
2400     * 4) Rotate the subtree.
2401     *
2402     * 5) Find the next subtree by considering the left path to be
2403     * the new right path.
2404     *
2405     * The check at the top of this while loop also accepts
2406     * insert_cpos == cpos because cpos is only a _theoretical_
2407     * value to get us the left path - insert_cpos might very well
2408     * be filling that hole.
2409     *
2410     * Stop at a cpos of '0' because we either started at the
2411     * leftmost branch (i.e., a tree with one branch and a
2412     * rotation inside of it), or we've gone as far as we can in
2413     * rotating subtrees.
2414     */
2415    while (cpos && insert_cpos <= cpos) {
2416        trace_ocfs2_rotate_tree_right(
2417            (unsigned long long)
2418            ocfs2_metadata_cache_owner(et->et_ci),
2419            insert_cpos, cpos);
2420
2421        ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2422        if (ret) {
2423            mlog_errno(ret);
2424            goto out;
2425        }
2426
2427        mlog_bug_on_msg(path_leaf_bh(left_path) ==
2428                path_leaf_bh(right_path),
2429                "Owner %llu: error during insert of %u "
2430                "(left path cpos %u) results in two identical "
2431                "paths ending at %llu\n",
2432                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2433                insert_cpos, cpos,
2434                (unsigned long long)
2435                path_leaf_bh(left_path)->b_blocknr);
2436
2437        if (split == SPLIT_NONE &&
2438            ocfs2_rotate_requires_path_adjustment(left_path,
2439                              insert_cpos)) {
2440
2441            /*
2442             * We've rotated the tree as much as we
2443             * should. The rest is up to
2444             * ocfs2_insert_path() to complete, after the
2445             * record insertion. We indicate this
2446             * situation by returning the left path.
2447             *
2448             * The reason we don't adjust the records here
2449             * before the record insert is that an error
2450             * later might break the rule where a parent
2451             * record e_cpos will reflect the actual
2452             * e_cpos of the 1st nonempty record of the
2453             * child list.
2454             */
2455            *ret_left_path = left_path;
2456            goto out_ret_path;
2457        }
2458
2459        start = ocfs2_find_subtree_root(et, left_path, right_path);
2460
2461        trace_ocfs2_rotate_subtree(start,
2462            (unsigned long long)
2463            right_path->p_node[start].bh->b_blocknr,
2464            right_path->p_tree_depth);
2465
2466        ret = ocfs2_extend_rotate_transaction(handle, start,
2467                              orig_credits, right_path);
2468        if (ret) {
2469            mlog_errno(ret);
2470            goto out;
2471        }
2472
2473        ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2474                         right_path, start);
2475        if (ret) {
2476            mlog_errno(ret);
2477            goto out;
2478        }
2479
2480        if (split != SPLIT_NONE &&
2481            ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2482                        insert_cpos)) {
2483            /*
2484             * A rotate moves the rightmost left leaf
2485             * record over to the leftmost right leaf
2486             * slot. If we're doing an extent split
2487             * instead of a real insert, then we have to
2488             * check that the extent to be split wasn't
2489             * just moved over. If it was, then we can
2490             * exit here, passing left_path back -
2491             * ocfs2_split_extent() is smart enough to
2492             * search both leaves.
2493             */
2494            *ret_left_path = left_path;
2495            goto out_ret_path;
2496        }
2497
2498        /*
2499         * There is no need to re-read the next right path
2500         * as we know that it'll be our current left
2501         * path. Optimize by copying values instead.
2502         */
2503        ocfs2_mv_path(right_path, left_path);
2504
2505        ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2506        if (ret) {
2507            mlog_errno(ret);
2508            goto out;
2509        }
2510    }
2511
2512out:
2513    ocfs2_free_path(left_path);
2514
2515out_ret_path:
2516    return ret;
2517}
2518
2519static int ocfs2_update_edge_lengths(handle_t *handle,
2520                     struct ocfs2_extent_tree *et,
2521                     int subtree_index, struct ocfs2_path *path)
2522{
2523    int i, idx, ret;
2524    struct ocfs2_extent_rec *rec;
2525    struct ocfs2_extent_list *el;
2526    struct ocfs2_extent_block *eb;
2527    u32 range;
2528
2529    /*
2530     * In normal tree rotation process, we will never touch the
2531     * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2532     * doesn't reserve the credits for them either.
2533     *
2534     * But we do have a special case here which will update the rightmost
2535     * records for all the bh in the path.
2536     * So we have to allocate extra credits and access them.
2537     */
2538    ret = ocfs2_extend_trans(handle, subtree_index);
2539    if (ret) {
2540        mlog_errno(ret);
2541        goto out;
2542    }
2543
2544    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2545    if (ret) {
2546        mlog_errno(ret);
2547        goto out;
2548    }
2549
2550    /* Path should always be rightmost. */
2551    eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2552    BUG_ON(eb->h_next_leaf_blk != 0ULL);
2553
2554    el = &eb->h_list;
2555    BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2556    idx = le16_to_cpu(el->l_next_free_rec) - 1;
2557    rec = &el->l_recs[idx];
2558    range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2559
2560    for (i = 0; i < path->p_tree_depth; i++) {
2561        el = path->p_node[i].el;
2562        idx = le16_to_cpu(el->l_next_free_rec) - 1;
2563        rec = &el->l_recs[idx];
2564
2565        rec->e_int_clusters = cpu_to_le32(range);
2566        le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2567
2568        ocfs2_journal_dirty(handle, path->p_node[i].bh);
2569    }
2570out:
2571    return ret;
2572}
2573
2574static void ocfs2_unlink_path(handle_t *handle,
2575                  struct ocfs2_extent_tree *et,
2576                  struct ocfs2_cached_dealloc_ctxt *dealloc,
2577                  struct ocfs2_path *path, int unlink_start)
2578{
2579    int ret, i;
2580    struct ocfs2_extent_block *eb;
2581    struct ocfs2_extent_list *el;
2582    struct buffer_head *bh;
2583
2584    for(i = unlink_start; i < path_num_items(path); i++) {
2585        bh = path->p_node[i].bh;
2586
2587        eb = (struct ocfs2_extent_block *)bh->b_data;
2588        /*
2589         * Not all nodes might have had their final count
2590         * decremented by the caller - handle this here.
2591         */
2592        el = &eb->h_list;
2593        if (le16_to_cpu(el->l_next_free_rec) > 1) {
2594            mlog(ML_ERROR,
2595                 "Inode %llu, attempted to remove extent block "
2596                 "%llu with %u records\n",
2597                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2598                 (unsigned long long)le64_to_cpu(eb->h_blkno),
2599                 le16_to_cpu(el->l_next_free_rec));
2600
2601            ocfs2_journal_dirty(handle, bh);
2602            ocfs2_remove_from_cache(et->et_ci, bh);
2603            continue;
2604        }
2605
2606        el->l_next_free_rec = 0;
2607        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2608
2609        ocfs2_journal_dirty(handle, bh);
2610
2611        ret = ocfs2_cache_extent_block_free(dealloc, eb);
2612        if (ret)
2613            mlog_errno(ret);
2614
2615        ocfs2_remove_from_cache(et->et_ci, bh);
2616    }
2617}
2618
2619static void ocfs2_unlink_subtree(handle_t *handle,
2620                 struct ocfs2_extent_tree *et,
2621                 struct ocfs2_path *left_path,
2622                 struct ocfs2_path *right_path,
2623                 int subtree_index,
2624                 struct ocfs2_cached_dealloc_ctxt *dealloc)
2625{
2626    int i;
2627    struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2628    struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2629    struct ocfs2_extent_list *el;
2630    struct ocfs2_extent_block *eb;
2631
2632    el = path_leaf_el(left_path);
2633
2634    eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2635
2636    for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2637        if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2638            break;
2639
2640    BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2641
2642    memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2643    le16_add_cpu(&root_el->l_next_free_rec, -1);
2644
2645    eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2646    eb->h_next_leaf_blk = 0;
2647
2648    ocfs2_journal_dirty(handle, root_bh);
2649    ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2650
2651    ocfs2_unlink_path(handle, et, dealloc, right_path,
2652              subtree_index + 1);
2653}
2654
2655static int ocfs2_rotate_subtree_left(handle_t *handle,
2656                     struct ocfs2_extent_tree *et,
2657                     struct ocfs2_path *left_path,
2658                     struct ocfs2_path *right_path,
2659                     int subtree_index,
2660                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2661                     int *deleted)
2662{
2663    int ret, i, del_right_subtree = 0, right_has_empty = 0;
2664    struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2665    struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2666    struct ocfs2_extent_block *eb;
2667
2668    *deleted = 0;
2669
2670    right_leaf_el = path_leaf_el(right_path);
2671    left_leaf_el = path_leaf_el(left_path);
2672    root_bh = left_path->p_node[subtree_index].bh;
2673    BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2674
2675    if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2676        return 0;
2677
2678    eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2679    if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2680        /*
2681         * It's legal for us to proceed if the right leaf is
2682         * the rightmost one and it has an empty extent. There
2683         * are two cases to handle - whether the leaf will be
2684         * empty after removal or not. If the leaf isn't empty
2685         * then just remove the empty extent up front. The
2686         * next block will handle empty leaves by flagging
2687         * them for unlink.
2688         *
2689         * Non rightmost leaves will throw -EAGAIN and the
2690         * caller can manually move the subtree and retry.
2691         */
2692
2693        if (eb->h_next_leaf_blk != 0ULL)
2694            return -EAGAIN;
2695
2696        if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2697            ret = ocfs2_journal_access_eb(handle, et->et_ci,
2698                              path_leaf_bh(right_path),
2699                              OCFS2_JOURNAL_ACCESS_WRITE);
2700            if (ret) {
2701                mlog_errno(ret);
2702                goto out;
2703            }
2704
2705            ocfs2_remove_empty_extent(right_leaf_el);
2706        } else
2707            right_has_empty = 1;
2708    }
2709
2710    if (eb->h_next_leaf_blk == 0ULL &&
2711        le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2712        /*
2713         * We have to update i_last_eb_blk during the meta
2714         * data delete.
2715         */
2716        ret = ocfs2_et_root_journal_access(handle, et,
2717                           OCFS2_JOURNAL_ACCESS_WRITE);
2718        if (ret) {
2719            mlog_errno(ret);
2720            goto out;
2721        }
2722
2723        del_right_subtree = 1;
2724    }
2725
2726    /*
2727     * Getting here with an empty extent in the right path implies
2728     * that it's the rightmost path and will be deleted.
2729     */
2730    BUG_ON(right_has_empty && !del_right_subtree);
2731
2732    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2733                       subtree_index);
2734    if (ret) {
2735        mlog_errno(ret);
2736        goto out;
2737    }
2738
2739    for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2740        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2741                           right_path, i);
2742        if (ret) {
2743            mlog_errno(ret);
2744            goto out;
2745        }
2746
2747        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2748                           left_path, i);
2749        if (ret) {
2750            mlog_errno(ret);
2751            goto out;
2752        }
2753    }
2754
2755    if (!right_has_empty) {
2756        /*
2757         * Only do this if we're moving a real
2758         * record. Otherwise, the action is delayed until
2759         * after removal of the right path in which case we
2760         * can do a simple shift to remove the empty extent.
2761         */
2762        ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2763        memset(&right_leaf_el->l_recs[0], 0,
2764               sizeof(struct ocfs2_extent_rec));
2765    }
2766    if (eb->h_next_leaf_blk == 0ULL) {
2767        /*
2768         * Move recs over to get rid of empty extent, decrease
2769         * next_free. This is allowed to remove the last
2770         * extent in our leaf (setting l_next_free_rec to
2771         * zero) - the delete code below won't care.
2772         */
2773        ocfs2_remove_empty_extent(right_leaf_el);
2774    }
2775
2776    ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2777    ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2778
2779    if (del_right_subtree) {
2780        ocfs2_unlink_subtree(handle, et, left_path, right_path,
2781                     subtree_index, dealloc);
2782        ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2783                        left_path);
2784        if (ret) {
2785            mlog_errno(ret);
2786            goto out;
2787        }
2788
2789        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2790        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2791
2792        /*
2793         * Removal of the extent in the left leaf was skipped
2794         * above so we could delete the right path
2795         * 1st.
2796         */
2797        if (right_has_empty)
2798            ocfs2_remove_empty_extent(left_leaf_el);
2799
2800        ocfs2_journal_dirty(handle, et_root_bh);
2801
2802        *deleted = 1;
2803    } else
2804        ocfs2_complete_edge_insert(handle, left_path, right_path,
2805                       subtree_index);
2806
2807out:
2808    return ret;
2809}
2810
2811/*
2812 * Given a full path, determine what cpos value would return us a path
2813 * containing the leaf immediately to the right of the current one.
2814 *
2815 * Will return zero if the path passed in is already the rightmost path.
2816 *
2817 * This looks similar, but is subtly different to
2818 * ocfs2_find_cpos_for_left_leaf().
2819 */
2820int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2821                   struct ocfs2_path *path, u32 *cpos)
2822{
2823    int i, j, ret = 0;
2824    u64 blkno;
2825    struct ocfs2_extent_list *el;
2826
2827    *cpos = 0;
2828
2829    if (path->p_tree_depth == 0)
2830        return 0;
2831
2832    blkno = path_leaf_bh(path)->b_blocknr;
2833
2834    /* Start at the tree node just above the leaf and work our way up. */
2835    i = path->p_tree_depth - 1;
2836    while (i >= 0) {
2837        int next_free;
2838
2839        el = path->p_node[i].el;
2840
2841        /*
2842         * Find the extent record just after the one in our
2843         * path.
2844         */
2845        next_free = le16_to_cpu(el->l_next_free_rec);
2846        for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2847            if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2848                if (j == (next_free - 1)) {
2849                    if (i == 0) {
2850                        /*
2851                         * We've determined that the
2852                         * path specified is already
2853                         * the rightmost one - return a
2854                         * cpos of zero.
2855                         */
2856                        goto out;
2857                    }
2858                    /*
2859                     * The rightmost record points to our
2860                     * leaf - we need to travel up the
2861                     * tree one level.
2862                     */
2863                    goto next_node;
2864                }
2865
2866                *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2867                goto out;
2868            }
2869        }
2870
2871        /*
2872         * If we got here, we never found a valid node where
2873         * the tree indicated one should be.
2874         */
2875        ocfs2_error(sb,
2876                "Invalid extent tree at extent block %llu\n",
2877                (unsigned long long)blkno);
2878        ret = -EROFS;
2879        goto out;
2880
2881next_node:
2882        blkno = path->p_node[i].bh->b_blocknr;
2883        i--;
2884    }
2885
2886out:
2887    return ret;
2888}
2889
2890static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2891                        struct ocfs2_extent_tree *et,
2892                        struct ocfs2_path *path)
2893{
2894    int ret;
2895    struct buffer_head *bh = path_leaf_bh(path);
2896    struct ocfs2_extent_list *el = path_leaf_el(path);
2897
2898    if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2899        return 0;
2900
2901    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2902                       path_num_items(path) - 1);
2903    if (ret) {
2904        mlog_errno(ret);
2905        goto out;
2906    }
2907
2908    ocfs2_remove_empty_extent(el);
2909    ocfs2_journal_dirty(handle, bh);
2910
2911out:
2912    return ret;
2913}
2914
2915static int __ocfs2_rotate_tree_left(handle_t *handle,
2916                    struct ocfs2_extent_tree *et,
2917                    int orig_credits,
2918                    struct ocfs2_path *path,
2919                    struct ocfs2_cached_dealloc_ctxt *dealloc,
2920                    struct ocfs2_path **empty_extent_path)
2921{
2922    int ret, subtree_root, deleted;
2923    u32 right_cpos;
2924    struct ocfs2_path *left_path = NULL;
2925    struct ocfs2_path *right_path = NULL;
2926    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2927
2928    BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2929
2930    *empty_extent_path = NULL;
2931
2932    ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2933    if (ret) {
2934        mlog_errno(ret);
2935        goto out;
2936    }
2937
2938    left_path = ocfs2_new_path_from_path(path);
2939    if (!left_path) {
2940        ret = -ENOMEM;
2941        mlog_errno(ret);
2942        goto out;
2943    }
2944
2945    ocfs2_cp_path(left_path, path);
2946
2947    right_path = ocfs2_new_path_from_path(path);
2948    if (!right_path) {
2949        ret = -ENOMEM;
2950        mlog_errno(ret);
2951        goto out;
2952    }
2953
2954    while (right_cpos) {
2955        ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2956        if (ret) {
2957            mlog_errno(ret);
2958            goto out;
2959        }
2960
2961        subtree_root = ocfs2_find_subtree_root(et, left_path,
2962                               right_path);
2963
2964        trace_ocfs2_rotate_subtree(subtree_root,
2965             (unsigned long long)
2966             right_path->p_node[subtree_root].bh->b_blocknr,
2967             right_path->p_tree_depth);
2968
2969        ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2970                              orig_credits, left_path);
2971        if (ret) {
2972            mlog_errno(ret);
2973            goto out;
2974        }
2975
2976        /*
2977         * Caller might still want to make changes to the
2978         * tree root, so re-add it to the journal here.
2979         */
2980        ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2981                           left_path, 0);
2982        if (ret) {
2983            mlog_errno(ret);
2984            goto out;
2985        }
2986
2987        ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2988                        right_path, subtree_root,
2989                        dealloc, &deleted);
2990        if (ret == -EAGAIN) {
2991            /*
2992             * The rotation has to temporarily stop due to
2993             * the right subtree having an empty
2994             * extent. Pass it back to the caller for a
2995             * fixup.
2996             */
2997            *empty_extent_path = right_path;
2998            right_path = NULL;
2999            goto out;
3000        }
3001        if (ret) {
3002            mlog_errno(ret);
3003            goto out;
3004        }
3005
3006        /*
3007         * The subtree rotate might have removed records on
3008         * the rightmost edge. If so, then rotation is
3009         * complete.
3010         */
3011        if (deleted)
3012            break;
3013
3014        ocfs2_mv_path(left_path, right_path);
3015
3016        ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3017                             &right_cpos);
3018        if (ret) {
3019            mlog_errno(ret);
3020            goto out;
3021        }
3022    }
3023
3024out:
3025    ocfs2_free_path(right_path);
3026    ocfs2_free_path(left_path);
3027
3028    return ret;
3029}
3030
3031static int ocfs2_remove_rightmost_path(handle_t *handle,
3032                struct ocfs2_extent_tree *et,
3033                struct ocfs2_path *path,
3034                struct ocfs2_cached_dealloc_ctxt *dealloc)
3035{
3036    int ret, subtree_index;
3037    u32 cpos;
3038    struct ocfs2_path *left_path = NULL;
3039    struct ocfs2_extent_block *eb;
3040    struct ocfs2_extent_list *el;
3041
3042
3043    ret = ocfs2_et_sanity_check(et);
3044    if (ret)
3045        goto out;
3046    /*
3047     * There's two ways we handle this depending on
3048     * whether path is the only existing one.
3049     */
3050    ret = ocfs2_extend_rotate_transaction(handle, 0,
3051                          handle->h_buffer_credits,
3052                          path);
3053    if (ret) {
3054        mlog_errno(ret);
3055        goto out;
3056    }
3057
3058    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3059    if (ret) {
3060        mlog_errno(ret);
3061        goto out;
3062    }
3063
3064    ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3065                        path, &cpos);
3066    if (ret) {
3067        mlog_errno(ret);
3068        goto out;
3069    }
3070
3071    if (cpos) {
3072        /*
3073         * We have a path to the left of this one - it needs
3074         * an update too.
3075         */
3076        left_path = ocfs2_new_path_from_path(path);
3077        if (!left_path) {
3078            ret = -ENOMEM;
3079            mlog_errno(ret);
3080            goto out;
3081        }
3082
3083        ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3084        if (ret) {
3085            mlog_errno(ret);
3086            goto out;
3087        }
3088
3089        ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3090        if (ret) {
3091            mlog_errno(ret);
3092            goto out;
3093        }
3094
3095        subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3096
3097        ocfs2_unlink_subtree(handle, et, left_path, path,
3098                     subtree_index, dealloc);
3099        ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3100                        left_path);
3101        if (ret) {
3102            mlog_errno(ret);
3103            goto out;
3104        }
3105
3106        eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3107        ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3108    } else {
3109        /*
3110         * 'path' is also the leftmost path which
3111         * means it must be the only one. This gets
3112         * handled differently because we want to
3113         * revert the root back to having extents
3114         * in-line.
3115         */
3116        ocfs2_unlink_path(handle, et, dealloc, path, 1);
3117
3118        el = et->et_root_el;
3119        el->l_tree_depth = 0;
3120        el->l_next_free_rec = 0;
3121        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3122
3123        ocfs2_et_set_last_eb_blk(et, 0);
3124    }
3125
3126    ocfs2_journal_dirty(handle, path_root_bh(path));
3127
3128out:
3129    ocfs2_free_path(left_path);
3130    return ret;
3131}
3132
3133/*
3134 * Left rotation of btree records.
3135 *
3136 * In many ways, this is (unsurprisingly) the opposite of right
3137 * rotation. We start at some non-rightmost path containing an empty
3138 * extent in the leaf block. The code works its way to the rightmost
3139 * path by rotating records to the left in every subtree.
3140 *
3141 * This is used by any code which reduces the number of extent records
3142 * in a leaf. After removal, an empty record should be placed in the
3143 * leftmost list position.
3144 *
3145 * This won't handle a length update of the rightmost path records if
3146 * the rightmost tree leaf record is removed so the caller is
3147 * responsible for detecting and correcting that.
3148 */
3149static int ocfs2_rotate_tree_left(handle_t *handle,
3150                  struct ocfs2_extent_tree *et,
3151                  struct ocfs2_path *path,
3152                  struct ocfs2_cached_dealloc_ctxt *dealloc)
3153{
3154    int ret, orig_credits = handle->h_buffer_credits;
3155    struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3156    struct ocfs2_extent_block *eb;
3157    struct ocfs2_extent_list *el;
3158
3159    el = path_leaf_el(path);
3160    if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3161        return 0;
3162
3163    if (path->p_tree_depth == 0) {
3164rightmost_no_delete:
3165        /*
3166         * Inline extents. This is trivially handled, so do
3167         * it up front.
3168         */
3169        ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3170        if (ret)
3171            mlog_errno(ret);
3172        goto out;
3173    }
3174
3175    /*
3176     * Handle rightmost branch now. There's several cases:
3177     * 1) simple rotation leaving records in there. That's trivial.
3178     * 2) rotation requiring a branch delete - there's no more
3179     * records left. Two cases of this:
3180     * a) There are branches to the left.
3181     * b) This is also the leftmost (the only) branch.
3182     *
3183     * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
3184     * 2a) we need the left branch so that we can update it with the unlink
3185     * 2b) we need to bring the root back to inline extents.
3186     */
3187
3188    eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3189    el = &eb->h_list;
3190    if (eb->h_next_leaf_blk == 0) {
3191        /*
3192         * This gets a bit tricky if we're going to delete the
3193         * rightmost path. Get the other cases out of the way
3194         * 1st.
3195         */
3196        if (le16_to_cpu(el->l_next_free_rec) > 1)
3197            goto rightmost_no_delete;
3198
3199        if (le16_to_cpu(el->l_next_free_rec) == 0) {
3200            ret = -EIO;
3201            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3202                    "Owner %llu has empty extent block at %llu",
3203                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3204                    (unsigned long long)le64_to_cpu(eb->h_blkno));
3205            goto out;
3206        }
3207
3208        /*
3209         * XXX: The caller can not trust "path" any more after
3210         * this as it will have been deleted. What do we do?
3211         *
3212         * In theory the rotate-for-merge code will never get
3213         * here because it'll always ask for a rotate in a
3214         * nonempty list.
3215         */
3216
3217        ret = ocfs2_remove_rightmost_path(handle, et, path,
3218                          dealloc);
3219        if (ret)
3220            mlog_errno(ret);
3221        goto out;
3222    }
3223
3224    /*
3225     * Now we can loop, remembering the path we get from -EAGAIN
3226     * and restarting from there.
3227     */
3228try_rotate:
3229    ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3230                       dealloc, &restart_path);
3231    if (ret && ret != -EAGAIN) {
3232        mlog_errno(ret);
3233        goto out;
3234    }
3235
3236    while (ret == -EAGAIN) {
3237        tmp_path = restart_path;
3238        restart_path = NULL;
3239
3240        ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3241                           tmp_path, dealloc,
3242                           &restart_path);
3243        if (ret && ret != -EAGAIN) {
3244            mlog_errno(ret);
3245            goto out;
3246        }
3247
3248        ocfs2_free_path(tmp_path);
3249        tmp_path = NULL;
3250
3251        if (ret == 0)
3252            goto try_rotate;
3253    }
3254
3255out:
3256    ocfs2_free_path(tmp_path);
3257    ocfs2_free_path(restart_path);
3258    return ret;
3259}
3260
3261static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3262                int index)
3263{
3264    struct ocfs2_extent_rec *rec = &el->l_recs[index];
3265    unsigned int size;
3266
3267    if (rec->e_leaf_clusters == 0) {
3268        /*
3269         * We consumed all of the merged-from record. An empty
3270         * extent cannot exist anywhere but the 1st array
3271         * position, so move things over if the merged-from
3272         * record doesn't occupy that position.
3273         *
3274         * This creates a new empty extent so the caller
3275         * should be smart enough to have removed any existing
3276         * ones.
3277         */
3278        if (index > 0) {
3279            BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3280            size = index * sizeof(struct ocfs2_extent_rec);
3281            memmove(&el->l_recs[1], &el->l_recs[0], size);
3282        }
3283
3284        /*
3285         * Always memset - the caller doesn't check whether it
3286         * created an empty extent, so there could be junk in
3287         * the other fields.
3288         */
3289        memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3290    }
3291}
3292
3293static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3294                struct ocfs2_path *left_path,
3295                struct ocfs2_path **ret_right_path)
3296{
3297    int ret;
3298    u32 right_cpos;
3299    struct ocfs2_path *right_path = NULL;
3300    struct ocfs2_extent_list *left_el;
3301
3302    *ret_right_path = NULL;
3303
3304    /* This function shouldn't be called for non-trees. */
3305    BUG_ON(left_path->p_tree_depth == 0);
3306
3307    left_el = path_leaf_el(left_path);
3308    BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3309
3310    ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3311                         left_path, &right_cpos);
3312    if (ret) {
3313        mlog_errno(ret);
3314        goto out;
3315    }
3316
3317    /* This function shouldn't be called for the rightmost leaf. */
3318    BUG_ON(right_cpos == 0);
3319
3320    right_path = ocfs2_new_path_from_path(left_path);
3321    if (!right_path) {
3322        ret = -ENOMEM;
3323        mlog_errno(ret);
3324        goto out;
3325    }
3326
3327    ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3328    if (ret) {
3329        mlog_errno(ret);
3330        goto out;
3331    }
3332
3333    *ret_right_path = right_path;
3334out:
3335    if (ret)
3336        ocfs2_free_path(right_path);
3337    return ret;
3338}
3339
3340/*
3341 * Remove split_rec clusters from the record at index and merge them
3342 * onto the beginning of the record "next" to it.
3343 * For index < l_count - 1, the next means the extent rec at index + 1.
3344 * For index == l_count - 1, the "next" means the 1st extent rec of the
3345 * next extent block.
3346 */
3347static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3348                 handle_t *handle,
3349                 struct ocfs2_extent_tree *et,
3350                 struct ocfs2_extent_rec *split_rec,
3351                 int index)
3352{
3353    int ret, next_free, i;
3354    unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3355    struct ocfs2_extent_rec *left_rec;
3356    struct ocfs2_extent_rec *right_rec;
3357    struct ocfs2_extent_list *right_el;
3358    struct ocfs2_path *right_path = NULL;
3359    int subtree_index = 0;
3360    struct ocfs2_extent_list *el = path_leaf_el(left_path);
3361    struct buffer_head *bh = path_leaf_bh(left_path);
3362    struct buffer_head *root_bh = NULL;
3363
3364    BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3365    left_rec = &el->l_recs[index];
3366
3367    if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3368        le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3369        /* we meet with a cross extent block merge. */
3370        ret = ocfs2_get_right_path(et, left_path, &right_path);
3371        if (ret) {
3372            mlog_errno(ret);
3373            goto out;
3374        }
3375
3376        right_el = path_leaf_el(right_path);
3377        next_free = le16_to_cpu(right_el->l_next_free_rec);
3378        BUG_ON(next_free <= 0);
3379        right_rec = &right_el->l_recs[0];
3380        if (ocfs2_is_empty_extent(right_rec)) {
3381            BUG_ON(next_free <= 1);
3382            right_rec = &right_el->l_recs[1];
3383        }
3384
3385        BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3386               le16_to_cpu(left_rec->e_leaf_clusters) !=
3387               le32_to_cpu(right_rec->e_cpos));
3388
3389        subtree_index = ocfs2_find_subtree_root(et, left_path,
3390                            right_path);
3391
3392        ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3393                              handle->h_buffer_credits,
3394                              right_path);
3395        if (ret) {
3396            mlog_errno(ret);
3397            goto out;
3398        }
3399
3400        root_bh = left_path->p_node[subtree_index].bh;
3401        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3402
3403        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3404                           subtree_index);
3405        if (ret) {
3406            mlog_errno(ret);
3407            goto out;
3408        }
3409
3410        for (i = subtree_index + 1;
3411             i < path_num_items(right_path); i++) {
3412            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3413                               right_path, i);
3414            if (ret) {
3415                mlog_errno(ret);
3416                goto out;
3417            }
3418
3419            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3420                               left_path, i);
3421            if (ret) {
3422                mlog_errno(ret);
3423                goto out;
3424            }
3425        }
3426
3427    } else {
3428        BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3429        right_rec = &el->l_recs[index + 1];
3430    }
3431
3432    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3433                       path_num_items(left_path) - 1);
3434    if (ret) {
3435        mlog_errno(ret);
3436        goto out;
3437    }
3438
3439    le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3440
3441    le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3442    le64_add_cpu(&right_rec->e_blkno,
3443             -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3444                           split_clusters));
3445    le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3446
3447    ocfs2_cleanup_merge(el, index);
3448
3449    ocfs2_journal_dirty(handle, bh);
3450    if (right_path) {
3451        ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3452        ocfs2_complete_edge_insert(handle, left_path, right_path,
3453                       subtree_index);
3454    }
3455out:
3456    if (right_path)
3457        ocfs2_free_path(right_path);
3458    return ret;
3459}
3460
3461static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3462                   struct ocfs2_path *right_path,
3463                   struct ocfs2_path **ret_left_path)
3464{
3465    int ret;
3466    u32 left_cpos;
3467    struct ocfs2_path *left_path = NULL;
3468
3469    *ret_left_path = NULL;
3470
3471    /* This function shouldn't be called for non-trees. */
3472    BUG_ON(right_path->p_tree_depth == 0);
3473
3474    ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3475                        right_path, &left_cpos);
3476    if (ret) {
3477        mlog_errno(ret);
3478        goto out;
3479    }
3480
3481    /* This function shouldn't be called for the leftmost leaf. */
3482    BUG_ON(left_cpos == 0);
3483
3484    left_path = ocfs2_new_path_from_path(right_path);
3485    if (!left_path) {
3486        ret = -ENOMEM;
3487        mlog_errno(ret);
3488        goto out;
3489    }
3490
3491    ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3492    if (ret) {
3493        mlog_errno(ret);
3494        goto out;
3495    }
3496
3497    *ret_left_path = left_path;
3498out:
3499    if (ret)
3500        ocfs2_free_path(left_path);
3501    return ret;
3502}
3503
3504/*
3505 * Remove split_rec clusters from the record at index and merge them
3506 * onto the tail of the record "before" it.
3507 * For index > 0, the "before" means the extent rec at index - 1.
3508 *
3509 * For index == 0, the "before" means the last record of the previous
3510 * extent block. And there is also a situation that we may need to
3511 * remove the rightmost leaf extent block in the right_path and change
3512 * the right path to indicate the new rightmost path.
3513 */
3514static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3515                handle_t *handle,
3516                struct ocfs2_extent_tree *et,
3517                struct ocfs2_extent_rec *split_rec,
3518                struct ocfs2_cached_dealloc_ctxt *dealloc,
3519                int index)
3520{
3521    int ret, i, subtree_index = 0, has_empty_extent = 0;
3522    unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3523    struct ocfs2_extent_rec *left_rec;
3524    struct ocfs2_extent_rec *right_rec;
3525    struct ocfs2_extent_list *el = path_leaf_el(right_path);
3526    struct buffer_head *bh = path_leaf_bh(right_path);
3527    struct buffer_head *root_bh = NULL;
3528    struct ocfs2_path *left_path = NULL;
3529    struct ocfs2_extent_list *left_el;
3530
3531    BUG_ON(index < 0);
3532
3533    right_rec = &el->l_recs[index];
3534    if (index == 0) {
3535        /* we meet with a cross extent block merge. */
3536        ret = ocfs2_get_left_path(et, right_path, &left_path);
3537        if (ret) {
3538            mlog_errno(ret);
3539            goto out;
3540        }
3541
3542        left_el = path_leaf_el(left_path);
3543        BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3544               le16_to_cpu(left_el->l_count));
3545
3546        left_rec = &left_el->l_recs[
3547                le16_to_cpu(left_el->l_next_free_rec) - 1];
3548        BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3549               le16_to_cpu(left_rec->e_leaf_clusters) !=
3550               le32_to_cpu(split_rec->e_cpos));
3551
3552        subtree_index = ocfs2_find_subtree_root(et, left_path,
3553                            right_path);
3554
3555        ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3556                              handle->h_buffer_credits,
3557                              left_path);
3558        if (ret) {
3559            mlog_errno(ret);
3560            goto out;
3561        }
3562
3563        root_bh = left_path->p_node[subtree_index].bh;
3564        BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3565
3566        ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3567                           subtree_index);
3568        if (ret) {
3569            mlog_errno(ret);
3570            goto out;
3571        }
3572
3573        for (i = subtree_index + 1;
3574             i < path_num_items(right_path); i++) {
3575            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3576                               right_path, i);
3577            if (ret) {
3578                mlog_errno(ret);
3579                goto out;
3580            }
3581
3582            ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3583                               left_path, i);
3584            if (ret) {
3585                mlog_errno(ret);
3586                goto out;
3587            }
3588        }
3589    } else {
3590        left_rec = &el->l_recs[index - 1];
3591        if (ocfs2_is_empty_extent(&el->l_recs[0]))
3592            has_empty_extent = 1;
3593    }
3594
3595    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3596                       path_num_items(right_path) - 1);
3597    if (ret) {
3598        mlog_errno(ret);
3599        goto out;
3600    }
3601
3602    if (has_empty_extent && index == 1) {
3603        /*
3604         * The easy case - we can just plop the record right in.
3605         */
3606        *left_rec = *split_rec;
3607
3608        has_empty_extent = 0;
3609    } else
3610        le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3611
3612    le32_add_cpu(&right_rec->e_cpos, split_clusters);
3613    le64_add_cpu(&right_rec->e_blkno,
3614             ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3615                          split_clusters));
3616    le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3617
3618    ocfs2_cleanup_merge(el, index);
3619
3620    ocfs2_journal_dirty(handle, bh);
3621    if (left_path) {
3622        ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3623
3624        /*
3625         * In the situation that the right_rec is empty and the extent
3626         * block is empty also, ocfs2_complete_edge_insert can't handle
3627         * it and we need to delete the right extent block.
3628         */
3629        if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3630            le16_to_cpu(el->l_next_free_rec) == 1) {
3631
3632            ret = ocfs2_remove_rightmost_path(handle, et,
3633                              right_path,
3634                              dealloc);
3635            if (ret) {
3636                mlog_errno(ret);
3637                goto out;
3638            }
3639
3640            /* Now the rightmost extent block has been deleted.
3641             * So we use the new rightmost path.
3642             */
3643            ocfs2_mv_path(right_path, left_path);
3644            left_path = NULL;
3645        } else
3646            ocfs2_complete_edge_insert(handle, left_path,
3647                           right_path, subtree_index);
3648    }
3649out:
3650    if (left_path)
3651        ocfs2_free_path(left_path);
3652    return ret;
3653}
3654
3655static int ocfs2_try_to_merge_extent(handle_t *handle,
3656                     struct ocfs2_extent_tree *et,
3657                     struct ocfs2_path *path,
3658                     int split_index,
3659                     struct ocfs2_extent_rec *split_rec,
3660                     struct ocfs2_cached_dealloc_ctxt *dealloc,
3661                     struct ocfs2_merge_ctxt *ctxt)
3662{
3663    int ret = 0;
3664    struct ocfs2_extent_list *el = path_leaf_el(path);
3665    struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3666
3667    BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3668
3669    if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3670        /*
3671         * The merge code will need to create an empty
3672         * extent to take the place of the newly
3673         * emptied slot. Remove any pre-existing empty
3674         * extents - having more than one in a leaf is
3675         * illegal.
3676         */
3677        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3678        if (ret) {
3679            mlog_errno(ret);
3680            goto out;
3681        }
3682        split_index--;
3683        rec = &el->l_recs[split_index];
3684    }
3685
3686    if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3687        /*
3688         * Left-right contig implies this.
3689         */
3690        BUG_ON(!ctxt->c_split_covers_rec);
3691
3692        /*
3693         * Since the leftright insert always covers the entire
3694         * extent, this call will delete the insert record
3695         * entirely, resulting in an empty extent record added to
3696         * the extent block.
3697         *
3698         * Since the adding of an empty extent shifts
3699         * everything back to the right, there's no need to
3700         * update split_index here.
3701         *
3702         * When the split_index is zero, we need to merge it to the
3703         * prevoius extent block. It is more efficient and easier
3704         * if we do merge_right first and merge_left later.
3705         */
3706        ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3707                        split_index);
3708        if (ret) {
3709            mlog_errno(ret);
3710            goto out;
3711        }
3712
3713        /*
3714         * We can only get this from logic error above.
3715         */
3716        BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3717
3718        /* The merge left us with an empty extent, remove it. */
3719        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3720        if (ret) {
3721            mlog_errno(ret);
3722            goto out;
3723        }
3724
3725        rec = &el->l_recs[split_index];
3726
3727        /*
3728         * Note that we don't pass split_rec here on purpose -
3729         * we've merged it into the rec already.
3730         */
3731        ret = ocfs2_merge_rec_left(path, handle, et, rec,
3732                       dealloc, split_index);
3733
3734        if (ret) {
3735            mlog_errno(ret);
3736            goto out;
3737        }
3738
3739        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3740        /*
3741         * Error from this last rotate is not critical, so
3742         * print but don't bubble it up.
3743         */
3744        if (ret)
3745            mlog_errno(ret);
3746        ret = 0;
3747    } else {
3748        /*
3749         * Merge a record to the left or right.
3750         *
3751         * 'contig_type' is relative to the existing record,
3752         * so for example, if we're "right contig", it's to
3753         * the record on the left (hence the left merge).
3754         */
3755        if (ctxt->c_contig_type == CONTIG_RIGHT) {
3756            ret = ocfs2_merge_rec_left(path, handle, et,
3757                           split_rec, dealloc,
3758                           split_index);
3759            if (ret) {
3760                mlog_errno(ret);
3761                goto out;
3762            }
3763        } else {
3764            ret = ocfs2_merge_rec_right(path, handle,
3765                            et, split_rec,
3766                            split_index);
3767            if (ret) {
3768                mlog_errno(ret);
3769                goto out;
3770            }
3771        }
3772
3773        if (ctxt->c_split_covers_rec) {
3774            /*
3775             * The merge may have left an empty extent in
3776             * our leaf. Try to rotate it away.
3777             */
3778            ret = ocfs2_rotate_tree_left(handle, et, path,
3779                             dealloc);
3780            if (ret)
3781                mlog_errno(ret);
3782            ret = 0;
3783        }
3784    }
3785
3786out:
3787    return ret;
3788}
3789
3790static void ocfs2_subtract_from_rec(struct super_block *sb,
3791                    enum ocfs2_split_type split,
3792                    struct ocfs2_extent_rec *rec,
3793                    struct ocfs2_extent_rec *split_rec)
3794{
3795    u64 len_blocks;
3796
3797    len_blocks = ocfs2_clusters_to_blocks(sb,
3798                le16_to_cpu(split_rec->e_leaf_clusters));
3799
3800    if (split == SPLIT_LEFT) {
3801        /*
3802         * Region is on the left edge of the existing
3803         * record.
3804         */
3805        le32_add_cpu(&rec->e_cpos,
3806                 le16_to_cpu(split_rec->e_leaf_clusters));
3807        le64_add_cpu(&rec->e_blkno, len_blocks);
3808        le16_add_cpu(&rec->e_leaf_clusters,
3809                 -le16_to_cpu(split_rec->e_leaf_clusters));
3810    } else {
3811        /*
3812         * Region is on the right edge of the existing
3813         * record.
3814         */
3815        le16_add_cpu(&rec->e_leaf_clusters,
3816                 -le16_to_cpu(split_rec->e_leaf_clusters));
3817    }
3818}
3819
3820/*
3821 * Do the final bits of extent record insertion at the target leaf
3822 * list. If this leaf is part of an allocation tree, it is assumed
3823 * that the tree above has been prepared.
3824 */
3825static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3826                 struct ocfs2_extent_rec *insert_rec,
3827                 struct ocfs2_extent_list *el,
3828                 struct ocfs2_insert_type *insert)
3829{
3830    int i = insert->ins_contig_index;
3831    unsigned int range;
3832    struct ocfs2_extent_rec *rec;
3833
3834    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3835
3836    if (insert->ins_split != SPLIT_NONE) {
3837        i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3838        BUG_ON(i == -1);
3839        rec = &el->l_recs[i];
3840        ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3841                    insert->ins_split, rec,
3842                    insert_rec);
3843        goto rotate;
3844    }
3845
3846    /*
3847     * Contiguous insert - either left or right.
3848     */
3849    if (insert->ins_contig != CONTIG_NONE) {
3850        rec = &el->l_recs[i];
3851        if (insert->ins_contig == CONTIG_LEFT) {
3852            rec->e_blkno = insert_rec->e_blkno;
3853            rec->e_cpos = insert_rec->e_cpos;
3854        }
3855        le16_add_cpu(&rec->e_leaf_clusters,
3856                 le16_to_cpu(insert_rec->e_leaf_clusters));
3857        return;
3858    }
3859
3860    /*
3861     * Handle insert into an empty leaf.
3862     */
3863    if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3864        ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3865         ocfs2_is_empty_extent(&el->l_recs[0]))) {
3866        el->l_recs[0] = *insert_rec;
3867        el->l_next_free_rec = cpu_to_le16(1);
3868        return;
3869    }
3870
3871    /*
3872     * Appending insert.
3873     */
3874    if (insert->ins_appending == APPEND_TAIL) {
3875        i = le16_to_cpu(el->l_next_free_rec) - 1;
3876        rec = &el->l_recs[i];
3877        range = le32_to_cpu(rec->e_cpos)
3878            + le16_to_cpu(rec->e_leaf_clusters);
3879        BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3880
3881        mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3882                le16_to_cpu(el->l_count),
3883                "owner %llu, depth %u, count %u, next free %u, "
3884                "rec.cpos %u, rec.clusters %u, "
3885                "insert.cpos %u, insert.clusters %u\n",
3886                ocfs2_metadata_cache_owner(et->et_ci),
3887                le16_to_cpu(el->l_tree_depth),
3888                le16_to_cpu(el->l_count),
3889                le16_to_cpu(el->l_next_free_rec),
3890                le32_to_cpu(el->l_recs[i].e_cpos),
3891                le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3892                le32_to_cpu(insert_rec->e_cpos),
3893                le16_to_cpu(insert_rec->e_leaf_clusters));
3894        i++;
3895        el->l_recs[i] = *insert_rec;
3896        le16_add_cpu(&el->l_next_free_rec, 1);
3897        return;
3898    }
3899
3900rotate:
3901    /*
3902     * Ok, we have to rotate.
3903     *
3904     * At this point, it is safe to assume that inserting into an
3905     * empty leaf and appending to a leaf have both been handled
3906     * above.
3907     *
3908     * This leaf needs to have space, either by the empty 1st
3909     * extent record, or by virtue of an l_next_rec < l_count.
3910     */
3911    ocfs2_rotate_leaf(el, insert_rec);
3912}
3913
3914static void ocfs2_adjust_rightmost_records(handle_t *handle,
3915                       struct ocfs2_extent_tree *et,
3916                       struct ocfs2_path *path,
3917                       struct ocfs2_extent_rec *insert_rec)
3918{
3919    int ret, i, next_free;
3920    struct buffer_head *bh;
3921    struct ocfs2_extent_list *el;
3922    struct ocfs2_extent_rec *rec;
3923
3924    /*
3925     * Update everything except the leaf block.
3926     */
3927    for (i = 0; i < path->p_tree_depth; i++) {
3928        bh = path->p_node[i].bh;
3929        el = path->p_node[i].el;
3930
3931        next_free = le16_to_cpu(el->l_next_free_rec);
3932        if (next_free == 0) {
3933            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3934                    "Owner %llu has a bad extent list",
3935                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3936            ret = -EIO;
3937            return;
3938        }
3939
3940        rec = &el->l_recs[next_free - 1];
3941
3942        rec->e_int_clusters = insert_rec->e_cpos;
3943        le32_add_cpu(&rec->e_int_clusters,
3944                 le16_to_cpu(insert_rec->e_leaf_clusters));
3945        le32_add_cpu(&rec->e_int_clusters,
3946                 -le32_to_cpu(rec->e_cpos));
3947
3948        ocfs2_journal_dirty(handle, bh);
3949    }
3950}
3951
3952static int ocfs2_append_rec_to_path(handle_t *handle,
3953                    struct ocfs2_extent_tree *et,
3954                    struct ocfs2_extent_rec *insert_rec,
3955                    struct ocfs2_path *right_path,
3956                    struct ocfs2_path **ret_left_path)
3957{
3958    int ret, next_free;
3959    struct ocfs2_extent_list *el;
3960    struct ocfs2_path *left_path = NULL;
3961
3962    *ret_left_path = NULL;
3963
3964    /*
3965     * This shouldn't happen for non-trees. The extent rec cluster
3966     * count manipulation below only works for interior nodes.
3967     */
3968    BUG_ON(right_path->p_tree_depth == 0);
3969
3970    /*
3971     * If our appending insert is at the leftmost edge of a leaf,
3972     * then we might need to update the rightmost records of the
3973     * neighboring path.
3974     */
3975    el = path_leaf_el(right_path);
3976    next_free = le16_to_cpu(el->l_next_free_rec);
3977    if (next_free == 0 ||
3978        (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3979        u32 left_cpos;
3980
3981        ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3982                            right_path, &left_cpos);
3983        if (ret) {
3984            mlog_errno(ret);
3985            goto out;
3986        }
3987
3988        trace_ocfs2_append_rec_to_path(
3989            (unsigned long long)
3990            ocfs2_metadata_cache_owner(et->et_ci),
3991            le32_to_cpu(insert_rec->e_cpos),
3992            left_cpos);
3993
3994        /*
3995         * No need to worry if the append is already in the
3996         * leftmost leaf.
3997         */
3998        if (left_cpos) {
3999            left_path = ocfs2_new_path_from_path(right_path);
4000            if (!left_path) {
4001                ret = -ENOMEM;
4002                mlog_errno(ret);
4003                goto out;
4004            }
4005
4006            ret = ocfs2_find_path(et->et_ci, left_path,
4007                          left_cpos);
4008            if (ret) {
4009                mlog_errno(ret);
4010                goto out;
4011            }
4012
4013            /*
4014             * ocfs2_insert_path() will pass the left_path to the
4015             * journal for us.
4016             */
4017        }
4018    }
4019
4020    ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4021    if (ret) {
4022        mlog_errno(ret);
4023        goto out;
4024    }
4025
4026    ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4027
4028    *ret_left_path = left_path;
4029    ret = 0;
4030out:
4031    if (ret != 0)
4032        ocfs2_free_path(left_path);
4033
4034    return ret;
4035}
4036
4037static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4038                   struct ocfs2_path *left_path,
4039                   struct ocfs2_path *right_path,
4040                   struct ocfs2_extent_rec *split_rec,
4041                   enum ocfs2_split_type split)
4042{
4043    int index;
4044    u32 cpos = le32_to_cpu(split_rec->e_cpos);
4045    struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4046    struct ocfs2_extent_rec *rec, *tmprec;
4047
4048    right_el = path_leaf_el(right_path);
4049    if (left_path)
4050        left_el = path_leaf_el(left_path);
4051
4052    el = right_el;
4053    insert_el = right_el;
4054    index = ocfs2_search_extent_list(el, cpos);
4055    if (index != -1) {
4056        if (index == 0 && left_path) {
4057            BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4058
4059            /*
4060             * This typically means that the record
4061             * started in the left path but moved to the
4062             * right as a result of rotation. We either
4063             * move the existing record to the left, or we
4064             * do the later insert there.
4065             *
4066             * In this case, the left path should always
4067             * exist as the rotate code will have passed
4068             * it back for a post-insert update.
4069             */
4070
4071            if (split == SPLIT_LEFT) {
4072                /*
4073                 * It's a left split. Since we know
4074                 * that the rotate code gave us an
4075                 * empty extent in the left path, we
4076                 * can just do the insert there.
4077                 */
4078                insert_el = left_el;
4079            } else {
4080                /*
4081                 * Right split - we have to move the
4082                 * existing record over to the left
4083                 * leaf. The insert will be into the
4084                 * newly created empty extent in the
4085                 * right leaf.
4086                 */
4087                tmprec = &right_el->l_recs[index];
4088                ocfs2_rotate_leaf(left_el, tmprec);
4089                el = left_el;
4090
4091                memset(tmprec, 0, sizeof(*tmprec));
4092                index = ocfs2_search_extent_list(left_el, cpos);
4093                BUG_ON(index == -1);
4094            }
4095        }
4096    } else {
4097        BUG_ON(!left_path);
4098        BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4099        /*
4100         * Left path is easy - we can just allow the insert to
4101         * happen.
4102         */
4103        el = left_el;
4104        insert_el = left_el;
4105        index = ocfs2_search_extent_list(el, cpos);
4106        BUG_ON(index == -1);
4107    }
4108
4109    rec = &el->l_recs[index];
4110    ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4111                split, rec, split_rec);
4112    ocfs2_rotate_leaf(insert_el, split_rec);
4113}
4114
4115/*
4116 * This function only does inserts on an allocation b-tree. For tree
4117 * depth = 0, ocfs2_insert_at_leaf() is called directly.
4118 *
4119 * right_path is the path we want to do the actual insert
4120 * in. left_path should only be passed in if we need to update that
4121 * portion of the tree after an edge insert.
4122 */
4123static int ocfs2_insert_path(handle_t *handle,
4124                 struct ocfs2_extent_tree *et,
4125                 struct ocfs2_path *left_path,
4126                 struct ocfs2_path *right_path,
4127                 struct ocfs2_extent_rec *insert_rec,
4128                 struct ocfs2_insert_type *insert)
4129{
4130    int ret, subtree_index;
4131    struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4132
4133    if (left_path) {
4134        /*
4135         * There's a chance that left_path got passed back to
4136         * us without being accounted for in the
4137         * journal. Extend our transaction here to be sure we
4138         * can change those blocks.
4139         */
4140        ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4141        if (ret < 0) {
4142            mlog_errno(ret);
4143            goto out;
4144        }
4145
4146        ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4147        if (ret < 0) {
4148            mlog_errno(ret);
4149            goto out;
4150        }
4151    }
4152
4153    /*
4154     * Pass both paths to the journal. The majority of inserts
4155     * will be touching all components anyway.
4156     */
4157    ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4158    if (ret < 0) {
4159        mlog_errno(ret);
4160        goto out;
4161    }
4162
4163    if (insert->ins_split != SPLIT_NONE) {
4164        /*
4165         * We could call ocfs2_insert_at_leaf() for some types
4166         * of splits, but it's easier to just let one separate
4167         * function sort it all out.
4168         */
4169        ocfs2_split_record(et, left_path, right_path,
4170                   insert_rec, insert->ins_split);
4171
4172        /*
4173         * Split might have modified either leaf and we don't
4174         * have a guarantee that the later edge insert will
4175         * dirty this for us.
4176         */
4177        if (left_path)
4178            ocfs2_journal_dirty(handle,
4179                        path_leaf_bh(left_path));
4180    } else
4181        ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4182                     insert);
4183
4184    ocfs2_journal_dirty(handle, leaf_bh);
4185
4186    if (left_path) {
4187        /*
4188         * The rotate code has indicated that we need to fix
4189         * up portions of the tree after the insert.
4190         *
4191         * XXX: Should we extend the transaction here?
4192         */
4193        subtree_index = ocfs2_find_subtree_root(et, left_path,
4194                            right_path);
4195        ocfs2_complete_edge_insert(handle, left_path, right_path,
4196                       subtree_index);
4197    }
4198
4199    ret = 0;
4200out:
4201    return ret;
4202}
4203
4204static int ocfs2_do_insert_extent(handle_t *handle,
4205                  struct ocfs2_extent_tree *et,
4206                  struct ocfs2_extent_rec *insert_rec,
4207                  struct ocfs2_insert_type *type)
4208{
4209    int ret, rotate = 0;
4210    u32 cpos;
4211    struct ocfs2_path *right_path = NULL;
4212    struct ocfs2_path *left_path = NULL;
4213    struct ocfs2_extent_list *el;
4214
4215    el = et->et_root_el;
4216
4217    ret = ocfs2_et_root_journal_access(handle, et,
4218                       OCFS2_JOURNAL_ACCESS_WRITE);
4219    if (ret) {
4220        mlog_errno(ret);
4221        goto out;
4222    }
4223
4224    if (le16_to_cpu(el->l_tree_depth) == 0) {
4225        ocfs2_insert_at_leaf(et, insert_rec, el, type);
4226        goto out_update_clusters;
4227    }
4228
4229    right_path = ocfs2_new_path_from_et(et);
4230    if (!right_path) {
4231        ret = -ENOMEM;
4232        mlog_errno(ret);
4233        goto out;
4234    }
4235
4236    /*
4237     * Determine the path to start with. Rotations need the
4238     * rightmost path, everything else can go directly to the
4239     * target leaf.
4240     */
4241    cpos = le32_to_cpu(insert_rec->e_cpos);
4242    if (type->ins_appending == APPEND_NONE &&
4243        type->ins_contig == CONTIG_NONE) {
4244        rotate = 1;
4245        cpos = UINT_MAX;
4246    }
4247
4248    ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4249    if (ret) {
4250        mlog_errno(ret);
4251        goto out;
4252    }
4253
4254    /*
4255     * Rotations and appends need special treatment - they modify
4256     * parts of the tree's above them.
4257     *
4258     * Both might pass back a path immediate to the left of the
4259     * one being inserted to. This will be cause
4260     * ocfs2_insert_path() to modify the rightmost records of
4261     * left_path to account for an edge insert.
4262     *
4263     * XXX: When modifying this code, keep in mind that an insert
4264     * can wind up skipping both of these two special cases...
4265     */
4266    if (rotate) {
4267        ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4268                          le32_to_cpu(insert_rec->e_cpos),
4269                          right_path, &left_path);
4270        if (ret) {
4271            mlog_errno(ret);
4272            goto out;
4273        }
4274
4275        /*
4276         * ocfs2_rotate_tree_right() might have extended the
4277         * transaction without re-journaling our tree root.
4278         */
4279        ret = ocfs2_et_root_journal_access(handle, et,
4280                           OCFS2_JOURNAL_ACCESS_WRITE);
4281        if (ret) {
4282            mlog_errno(ret);
4283            goto out;
4284        }
4285    } else if (type->ins_appending == APPEND_TAIL
4286           && type->ins_contig != CONTIG_LEFT) {
4287        ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4288                           right_path, &left_path);
4289        if (ret) {
4290            mlog_errno(ret);
4291            goto out;
4292        }
4293    }
4294
4295    ret = ocfs2_insert_path(handle, et, left_path, right_path,
4296                insert_rec, type);
4297    if (ret) {
4298        mlog_errno(ret);
4299        goto out;
4300    }
4301
4302out_update_clusters:
4303    if (type->ins_split == SPLIT_NONE)
4304        ocfs2_et_update_clusters(et,
4305                     le16_to_cpu(insert_rec->e_leaf_clusters));
4306
4307    ocfs2_journal_dirty(handle, et->et_root_bh);
4308
4309out:
4310    ocfs2_free_path(left_path);
4311    ocfs2_free_path(right_path);
4312
4313    return ret;
4314}
4315
4316static enum ocfs2_contig_type
4317ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4318                   struct ocfs2_path *path,
4319                   struct ocfs2_extent_list *el, int index,
4320                   struct ocfs2_extent_rec *split_rec)
4321{
4322    int status;
4323    enum ocfs2_contig_type ret = CONTIG_NONE;
4324    u32 left_cpos, right_cpos;
4325    struct ocfs2_extent_rec *rec = NULL;
4326    struct ocfs2_extent_list *new_el;
4327    struct ocfs2_path *left_path = NULL, *right_path = NULL;
4328    struct buffer_head *bh;
4329    struct ocfs2_extent_block *eb;
4330    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4331
4332    if (index > 0) {
4333        rec = &el->l_recs[index - 1];
4334    } else if (path->p_tree_depth > 0) {
4335        status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4336        if (status)
4337            goto out;
4338
4339        if (left_cpos != 0) {
4340            left_path = ocfs2_new_path_from_path(path);
4341            if (!left_path)
4342                goto out;
4343
4344            status = ocfs2_find_path(et->et_ci, left_path,
4345                         left_cpos);
4346            if (status)
4347                goto out;
4348
4349            new_el = path_leaf_el(left_path);
4350
4351            if (le16_to_cpu(new_el->l_next_free_rec) !=
4352                le16_to_cpu(new_el->l_count)) {
4353                bh = path_leaf_bh(left_path);
4354                eb = (struct ocfs2_extent_block *)bh->b_data;
4355                ocfs2_error(sb,
4356                        "Extent block #%llu has an "
4357                        "invalid l_next_free_rec of "
4358                        "%d. It should have "
4359                        "matched the l_count of %d",
4360                        (unsigned long long)le64_to_cpu(eb->h_blkno),
4361                        le16_to_cpu(new_el->l_next_free_rec),
4362                        le16_to_cpu(new_el->l_count));
4363                status = -EINVAL;
4364                goto out;
4365            }
4366            rec = &new_el->l_recs[
4367                le16_to_cpu(new_el->l_next_free_rec) - 1];
4368        }
4369    }
4370
4371    /*
4372     * We're careful to check for an empty extent record here -
4373     * the merge code will know what to do if it sees one.
4374     */
4375    if (rec) {
4376        if (index == 1 && ocfs2_is_empty_extent(rec)) {
4377            if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4378                ret = CONTIG_RIGHT;
4379        } else {
4380            ret = ocfs2_et_extent_contig(et, rec, split_rec);
4381        }
4382    }
4383
4384    rec = NULL;
4385    if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4386        rec = &el->l_recs[index + 1];
4387    else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4388         path->p_tree_depth > 0) {
4389        status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4390        if (status)
4391            goto out;
4392
4393        if (right_cpos == 0)
4394            goto out;
4395
4396        right_path = ocfs2_new_path_from_path(path);
4397        if (!right_path)
4398            goto out;
4399
4400        status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4401        if (status)
4402            goto out;
4403
4404        new_el = path_leaf_el(right_path);
4405        rec = &new_el->l_recs[0];
4406        if (ocfs2_is_empty_extent(rec)) {
4407            if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4408                bh = path_leaf_bh(right_path);
4409                eb = (struct ocfs2_extent_block *)bh->b_data;
4410                ocfs2_error(sb,
4411                        "Extent block #%llu has an "
4412                        "invalid l_next_free_rec of %d",
4413                        (unsigned long long)le64_to_cpu(eb->h_blkno),
4414                        le16_to_cpu(new_el->l_next_free_rec));
4415                status = -EINVAL;
4416                goto out;
4417            }
4418            rec = &new_el->l_recs[1];
4419        }
4420    }
4421
4422    if (rec) {
4423        enum ocfs2_contig_type contig_type;
4424
4425        contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4426
4427        if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4428            ret = CONTIG_LEFTRIGHT;
4429        else if (ret == CONTIG_NONE)
4430            ret = contig_type;
4431    }
4432
4433out:
4434    if (left_path)
4435        ocfs2_free_path(left_path);
4436    if (right_path)
4437        ocfs2_free_path(right_path);
4438
4439    return ret;
4440}
4441
4442static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4443                     struct ocfs2_insert_type *insert,
4444                     struct ocfs2_extent_list *el,
4445                     struct ocfs2_extent_rec *insert_rec)
4446{
4447    int i;
4448    enum ocfs2_contig_type contig_type = CONTIG_NONE;
4449
4450    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4451
4452    for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4453        contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4454                             insert_rec);
4455        if (contig_type != CONTIG_NONE) {
4456            insert->ins_contig_index = i;
4457            break;
4458        }
4459    }
4460    insert->ins_contig = contig_type;
4461
4462    if (insert->ins_contig != CONTIG_NONE) {
4463        struct ocfs2_extent_rec *rec =
4464                &el->l_recs[insert->ins_contig_index];
4465        unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4466                   le16_to_cpu(insert_rec->e_leaf_clusters);
4467
4468        /*
4469         * Caller might want us to limit the size of extents, don't
4470         * calculate contiguousness if we might exceed that limit.
4471         */
4472        if (et->et_max_leaf_clusters &&
4473            (len > et->et_max_leaf_clusters))
4474            insert->ins_contig = CONTIG_NONE;
4475    }
4476}
4477
4478/*
4479 * This should only be called against the righmost leaf extent list.
4480 *
4481 * ocfs2_figure_appending_type() will figure out whether we'll have to
4482 * insert at the tail of the rightmost leaf.
4483 *
4484 * This should also work against the root extent list for tree's with 0
4485 * depth. If we consider the root extent list to be the rightmost leaf node
4486 * then the logic here makes sense.
4487 */
4488static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4489                    struct ocfs2_extent_list *el,
4490                    struct ocfs2_extent_rec *insert_rec)
4491{
4492    int i;
4493    u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4494    struct ocfs2_extent_rec *rec;
4495
4496    insert->ins_appending = APPEND_NONE;
4497
4498    BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4499
4500    if (!el->l_next_free_rec)
4501        goto set_tail_append;
4502
4503    if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4504        /* Were all records empty? */
4505        if (le16_to_cpu(el->l_next_free_rec) == 1)
4506            goto set_tail_append;
4507    }
4508
4509    i = le16_to_cpu(el->l_next_free_rec) - 1;
4510    rec = &el->l_recs[i];
4511
4512    if (cpos >=
4513        (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4514        goto set_tail_append;
4515
4516    return;
4517
4518set_tail_append:
4519    insert->ins_appending = APPEND_TAIL;
4520}
4521
4522/*
4523 * Helper function called at the beginning of an insert.
4524 *
4525 * This computes a few things that are commonly used in the process of
4526 * inserting into the btree:
4527 * - Whether the new extent is contiguous with an existing one.
4528 * - The current tree depth.
4529 * - Whether the insert is an appending one.
4530 * - The total # of free records in the tree.
4531 *
4532 * All of the information is stored on the ocfs2_insert_type
4533 * structure.
4534 */
4535static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4536                    struct buffer_head **last_eb_bh,
4537                    struct ocfs2_extent_rec *insert_rec,
4538                    int *free_records,
4539                    struct ocfs2_insert_type *insert)
4540{
4541    int ret;
4542    struct ocfs2_extent_block *eb;
4543    struct ocfs2_extent_list *el;
4544    struct ocfs2_path *path = NULL;
4545    struct buffer_head *bh = NULL;
4546
4547    insert->ins_split = SPLIT_NONE;
4548
4549    el = et->et_root_el;
4550    insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4551
4552    if (el->l_tree_depth) {
4553        /*
4554         * If we have tree depth, we read in the
4555         * rightmost extent block ahead of time as
4556         * ocfs2_figure_insert_type() and ocfs2_add_branch()
4557         * may want it later.
4558         */
4559        ret = ocfs2_read_extent_block(et->et_ci,
4560                          ocfs2_et_get_last_eb_blk(et),
4561                          &bh);
4562        if (ret) {
4563            mlog_errno(ret);
4564            goto out;
4565        }
4566        eb = (struct ocfs2_extent_block *) bh->b_data;
4567        el = &eb->h_list;
4568    }
4569
4570    /*
4571     * Unless we have a contiguous insert, we'll need to know if
4572     * there is room left in our allocation tree for another
4573     * extent record.
4574     *
4575     * XXX: This test is simplistic, we can search for empty
4576     * extent records too.
4577     */
4578    *free_records = le16_to_cpu(el->l_count) -
4579        le16_to_cpu(el->l_next_free_rec);
4580
4581    if (!insert->ins_tree_depth) {
4582        ocfs2_figure_contig_type(et, insert, el, insert_rec);
4583        ocfs2_figure_appending_type(insert, el, insert_rec);
4584        return 0;
4585    }
4586
4587    path = ocfs2_new_path_from_et(et);
4588    if (!path) {
4589        ret = -ENOMEM;
4590        mlog_errno(ret);
4591        goto out;
4592    }
4593
4594    /*
4595     * In the case that we're inserting past what the tree
4596     * currently accounts for, ocfs2_find_path() will return for
4597     * us the rightmost tree path. This is accounted for below in
4598     * the appending code.
4599     */
4600    ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4601    if (ret) {
4602        mlog_errno(ret);
4603        goto out;
4604    }
4605
4606    el = path_leaf_el(path);
4607
4608    /*
4609     * Now that we have the path, there's two things we want to determine:
4610     * 1) Contiguousness (also set contig_index if this is so)
4611     *
4612     * 2) Are we doing an append? We can trivially break this up
4613         * into two types of appends: simple record append, or a
4614         * rotate inside the tail leaf.
4615     */
4616    ocfs2_figure_contig_type(et, insert, el, insert_rec);
4617
4618    /*
4619     * The insert code isn't quite ready to deal with all cases of
4620     * left contiguousness. Specifically, if it's an insert into
4621     * the 1st record in a leaf, it will require the adjustment of
4622     * cluster count on the last record of the path directly to it's
4623     * left. For now, just catch that case and fool the layers
4624     * above us. This works just fine for tree_depth == 0, which
4625     * is why we allow that above.
4626     */
4627    if (insert->ins_contig == CONTIG_LEFT &&
4628        insert->ins_contig_index == 0)
4629        insert->ins_contig = CONTIG_NONE;
4630
4631    /*
4632     * Ok, so we can simply compare against last_eb to figure out
4633     * whether the path doesn't exist. This will only happen in
4634     * the case that we're doing a tail append, so maybe we can
4635     * take advantage of that information somehow.
4636     */
4637    if (ocfs2_et_get_last_eb_blk(et) ==
4638        path_leaf_bh(path)->b_blocknr) {
4639        /*
4640         * Ok, ocfs2_find_path() returned us the rightmost
4641         * tree path. This might be an appending insert. There are
4642         * two cases:
4643         * 1) We're doing a true append at the tail:
4644         * -This might even be off the end of the leaf
4645         * 2) We're "appending" by rotating in the tail
4646         */
4647        ocfs2_figure_appending_type(insert, el, insert_rec);
4648    }
4649
4650out:
4651    ocfs2_free_path(path);
4652
4653    if (ret == 0)
4654        *last_eb_bh = bh;
4655    else
4656        brelse(bh);
4657    return ret;
4658}
4659
4660/*
4661 * Insert an extent into a btree.
4662 *
4663 * The caller needs to update the owning btree's cluster count.
4664 */
4665int ocfs2_insert_extent(handle_t *handle,
4666            struct ocfs2_extent_tree *et,
4667            u32 cpos,
4668            u64 start_blk,
4669            u32 new_clusters,
4670            u8 flags,
4671            struct ocfs2_alloc_context *meta_ac)
4672{
4673    int status;
4674    int uninitialized_var(free_records);
4675    struct buffer_head *last_eb_bh = NULL;
4676    struct ocfs2_insert_type insert = {0, };
4677    struct ocfs2_extent_rec rec;
4678
4679    trace_ocfs2_insert_extent_start(
4680        (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4681        cpos, new_clusters);
4682
4683    memset(&rec, 0, sizeof(rec));
4684    rec.e_cpos = cpu_to_le32(cpos);
4685    rec.e_blkno = cpu_to_le64(start_blk);
4686    rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4687    rec.e_flags = flags;
4688    status = ocfs2_et_insert_check(et, &rec);
4689    if (status) {
4690        mlog_errno(status);
4691        goto bail;
4692    }
4693
4694    status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4695                      &free_records, &insert);
4696    if (status < 0) {
4697        mlog_errno(status);
4698        goto bail;
4699    }
4700
4701    trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4702                  insert.ins_contig_index, free_records,
4703                  insert.ins_tree_depth);
4704
4705    if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4706        status = ocfs2_grow_tree(handle, et,
4707                     &insert.ins_tree_depth, &last_eb_bh,
4708                     meta_ac);
4709        if (status) {
4710            mlog_errno(status);
4711            goto bail;
4712        }
4713    }
4714
4715    /* Finally, we can add clusters. This might rotate the tree for us. */
4716    status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4717    if (status < 0)
4718        mlog_errno(status);
4719    else
4720        ocfs2_et_extent_map_insert(et, &rec);
4721
4722bail:
4723    brelse(last_eb_bh);
4724
4725    return status;
4726}
4727
4728/*
4729 * Allcate and add clusters into the extent b-tree.
4730 * The new clusters(clusters_to_add) will be inserted at logical_offset.
4731 * The extent b-tree's root is specified by et, and
4732 * it is not limited to the file storage. Any extent tree can use this
4733 * function if it implements the proper ocfs2_extent_tree.
4734 */
4735int ocfs2_add_clusters_in_btree(handle_t *handle,
4736                struct ocfs2_extent_tree *et,
4737                u32 *logical_offset,
4738                u32 clusters_to_add,
4739                int mark_unwritten,
4740                struct ocfs2_alloc_context *data_ac,
4741                struct ocfs2_alloc_context *meta_ac,
4742                enum ocfs2_alloc_restarted *reason_ret)
4743{
4744    int status = 0, err = 0;
4745    int free_extents;
4746    enum ocfs2_alloc_restarted reason = RESTART_NONE;
4747    u32 bit_off, num_bits;
4748    u64 block;
4749    u8 flags = 0;
4750    struct ocfs2_super *osb =
4751        OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4752
4753    BUG_ON(!clusters_to_add);
4754
4755    if (mark_unwritten)
4756        flags = OCFS2_EXT_UNWRITTEN;
4757
4758    free_extents = ocfs2_num_free_extents(osb, et);
4759    if (free_extents < 0) {
4760        status = free_extents;
4761        mlog_errno(status);
4762        goto leave;
4763    }
4764
4765    /* there are two cases which could cause us to EAGAIN in the
4766     * we-need-more-metadata case:
4767     * 1) we haven't reserved *any*
4768     * 2) we are so fragmented, we've needed to add metadata too
4769     * many times. */
4770    if (!free_extents && !meta_ac) {
4771        err = -1;
4772        status = -EAGAIN;
4773        reason = RESTART_META;
4774        goto leave;
4775    } else if ((!free_extents)
4776           && (ocfs2_alloc_context_bits_left(meta_ac)
4777               < ocfs2_extend_meta_needed(et->et_root_el))) {
4778        err = -2;
4779        status = -EAGAIN;
4780        reason = RESTART_META;
4781        goto leave;
4782    }
4783
4784    status = __ocfs2_claim_clusters(handle, data_ac, 1,
4785                    clusters_to_add, &bit_off, &num_bits);
4786    if (status < 0) {
4787        if (status != -ENOSPC)
4788            mlog_errno(status);
4789        goto leave;
4790    }
4791
4792    BUG_ON(num_bits > clusters_to_add);
4793
4794    /* reserve our write early -- insert_extent may update the tree root */
4795    status = ocfs2_et_root_journal_access(handle, et,
4796                          OCFS2_JOURNAL_ACCESS_WRITE);
4797    if (status < 0) {
4798        mlog_errno(status);
4799        goto leave;
4800    }
4801
4802    block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4803    trace_ocfs2_add_clusters_in_btree(
4804         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4805         bit_off, num_bits);
4806    status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4807                     num_bits, flags, meta_ac);
4808    if (status < 0) {
4809        mlog_errno(status);
4810        goto leave;
4811    }
4812
4813    ocfs2_journal_dirty(handle, et->et_root_bh);
4814
4815    clusters_to_add -= num_bits;
4816    *logical_offset += num_bits;
4817
4818    if (clusters_to_add) {
4819        err = clusters_to_add;
4820        status = -EAGAIN;
4821        reason = RESTART_TRANS;
4822    }
4823
4824leave:
4825    if (reason_ret)
4826        *reason_ret = reason;
4827    trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4828    return status;
4829}
4830
4831static void ocfs2_make_right_split_rec(struct super_block *sb,
4832                       struct ocfs2_extent_rec *split_rec,
4833                       u32 cpos,
4834                       struct ocfs2_extent_rec *rec)
4835{
4836    u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4837    u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4838
4839    memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4840
4841    split_rec->e_cpos = cpu_to_le32(cpos);
4842    split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4843
4844    split_rec->e_blkno = rec->e_blkno;
4845    le64_add_cpu(&split_rec->e_blkno,
4846             ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4847
4848    split_rec->e_flags = rec->e_flags;
4849}
4850
4851static int ocfs2_split_and_insert(handle_t *handle,
4852                  struct ocfs2_extent_tree *et,
4853                  struct ocfs2_path *path,
4854                  struct buffer_head **last_eb_bh,
4855                  int split_index,
4856                  struct ocfs2_extent_rec *orig_split_rec,
4857                  struct ocfs2_alloc_context *meta_ac)
4858{
4859    int ret = 0, depth;
4860    unsigned int insert_range, rec_range, do_leftright = 0;
4861    struct ocfs2_extent_rec tmprec;
4862    struct ocfs2_extent_list *rightmost_el;
4863    struct ocfs2_extent_rec rec;
4864    struct ocfs2_extent_rec split_rec = *orig_split_rec;
4865    struct ocfs2_insert_type insert;
4866    struct ocfs2_extent_block *eb;
4867
4868leftright:
4869    /*
4870     * Store a copy of the record on the stack - it might move
4871     * around as the tree is manipulated below.
4872     */
4873    rec = path_leaf_el(path)->l_recs[split_index];
4874
4875    rightmost_el = et->et_root_el;
4876
4877    depth = le16_to_cpu(rightmost_el->l_tree_depth);
4878    if (depth) {
4879        BUG_ON(!(*last_eb_bh));
4880        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4881        rightmost_el = &eb->h_list;
4882    }
4883
4884    if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4885        le16_to_cpu(rightmost_el->l_count)) {
4886        ret = ocfs2_grow_tree(handle, et,
4887                      &depth, last_eb_bh, meta_ac);
4888        if (ret) {
4889            mlog_errno(ret);
4890            goto out;
4891        }
4892    }
4893
4894    memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4895    insert.ins_appending = APPEND_NONE;
4896    insert.ins_contig = CONTIG_NONE;
4897    insert.ins_tree_depth = depth;
4898
4899    insert_range = le32_to_cpu(split_rec.e_cpos) +
4900        le16_to_cpu(split_rec.e_leaf_clusters);
4901    rec_range = le32_to_cpu(rec.e_cpos) +
4902        le16_to_cpu(rec.e_leaf_clusters);
4903
4904    if (split_rec.e_cpos == rec.e_cpos) {
4905        insert.ins_split = SPLIT_LEFT;
4906    } else if (insert_range == rec_range) {
4907        insert.ins_split = SPLIT_RIGHT;
4908    } else {
4909        /*
4910         * Left/right split. We fake this as a right split
4911         * first and then make a second pass as a left split.
4912         */
4913        insert.ins_split = SPLIT_RIGHT;
4914
4915        ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4916                       &tmprec, insert_range, &rec);
4917
4918        split_rec = tmprec;
4919
4920        BUG_ON(do_leftright);
4921        do_leftright = 1;
4922    }
4923
4924    ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4925    if (ret) {
4926        mlog_errno(ret);
4927        goto out;
4928    }
4929
4930    if (do_leftright == 1) {
4931        u32 cpos;
4932        struct ocfs2_extent_list *el;
4933
4934        do_leftright++;
4935        split_rec = *orig_split_rec;
4936
4937        ocfs2_reinit_path(path, 1);
4938
4939        cpos = le32_to_cpu(split_rec.e_cpos);
4940        ret = ocfs2_find_path(et->et_ci, path, cpos);
4941        if (ret) {
4942            mlog_errno(ret);
4943            goto out;
4944        }
4945
4946        el = path_leaf_el(path);
4947        split_index = ocfs2_search_extent_list(el, cpos);
4948        goto leftright;
4949    }
4950out:
4951
4952    return ret;
4953}
4954
4955static int ocfs2_replace_extent_rec(handle_t *handle,
4956                    struct ocfs2_extent_tree *et,
4957                    struct ocfs2_path *path,
4958                    struct ocfs2_extent_list *el,
4959                    int split_index,
4960                    struct ocfs2_extent_rec *split_rec)
4961{
4962    int ret;
4963
4964    ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4965                       path_num_items(path) - 1);
4966    if (ret) {
4967        mlog_errno(ret);
4968        goto out;
4969    }
4970
4971    el->l_recs[split_index] = *split_rec;
4972
4973    ocfs2_journal_dirty(handle, path_leaf_bh(path));
4974out:
4975    return ret;
4976}
4977
4978/*
4979 * Split part or all of the extent record at split_index in the leaf
4980 * pointed to by path. Merge with the contiguous extent record if needed.
4981 *
4982 * Care is taken to handle contiguousness so as to not grow the tree.
4983 *
4984 * meta_ac is not strictly necessary - we only truly need it if growth
4985 * of the tree is required. All other cases will degrade into a less
4986 * optimal tree layout.
4987 *
4988 * last_eb_bh should be the rightmost leaf block for any extent
4989 * btree. Since a split may grow the tree or a merge might shrink it,
4990 * the caller cannot trust the contents of that buffer after this call.
4991 *
4992 * This code is optimized for readability - several passes might be
4993 * made over certain portions of the tree. All of those blocks will
4994 * have been brought into cache (and pinned via the journal), so the
4995 * extra overhead is not expressed in terms of disk reads.
4996 */
4997int ocfs2_split_extent(handle_t *handle,
4998               struct ocfs2_extent_tree *et,
4999               struct ocfs2_path *path,
5000               int split_index,
5001               struct ocfs2_extent_rec *split_rec,
5002               struct ocfs2_alloc_context *meta_ac,
5003               struct ocfs2_cached_dealloc_ctxt *dealloc)
5004{
5005    int ret = 0;
5006    struct ocfs2_extent_list *el = path_leaf_el(path);
5007    struct buffer_head *last_eb_bh = NULL;
5008    struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5009    struct ocfs2_merge_ctxt ctxt;
5010    struct ocfs2_extent_list *rightmost_el;
5011
5012    if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5013        ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5014         (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5015        ret = -EIO;
5016        mlog_errno(ret);
5017        goto out;
5018    }
5019
5020    ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5021                                split_index,
5022                                split_rec);
5023
5024    /*
5025     * The core merge / split code wants to know how much room is
5026     * left in this allocation tree, so we pass the
5027     * rightmost extent list.
5028     */
5029    if (path->p_tree_depth) {
5030        struct ocfs2_extent_block *eb;
5031
5032        ret = ocfs2_read_extent_block(et->et_ci,
5033                          ocfs2_et_get_last_eb_blk(et),
5034                          &last_eb_bh);
5035        if (ret) {
5036            mlog_errno(ret);
5037            goto out;
5038        }
5039
5040        eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5041        rightmost_el = &eb->h_list;
5042    } else
5043        rightmost_el = path_root_el(path);
5044
5045    if (rec->e_cpos == split_rec->e_cpos &&
5046        rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5047        ctxt.c_split_covers_rec = 1;
5048    else
5049        ctxt.c_split_covers_rec = 0;
5050
5051    ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5052
5053    trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5054                 ctxt.c_has_empty_extent,
5055                 ctxt.c_split_covers_rec);
5056
5057    if (ctxt.c_contig_type == CONTIG_NONE) {
5058        if (ctxt.c_split_covers_rec)
5059            ret = ocfs2_replace_extent_rec(handle, et, path, el,
5060                               split_index, split_rec);
5061        else
5062            ret = ocfs2_split_and_insert(handle, et, path,
5063                             &last_eb_bh, split_index,
5064                             split_rec, meta_ac);
5065        if (ret)
5066            mlog_errno(ret);
5067    } else {
5068        ret = ocfs2_try_to_merge_extent(handle, et, path,
5069                        split_index, split_rec,
5070                        dealloc, &ctxt);
5071        if (ret)
5072            mlog_errno(ret);
5073    }
5074
5075out:
5076    brelse(last_eb_bh);
5077    return ret;
5078}
5079
5080/*
5081 * Change the flags of the already-existing extent at cpos for len clusters.
5082 *
5083 * new_flags: the flags we want to set.
5084 * clear_flags: the flags we want to clear.
5085 * phys: the new physical offset we want this new extent starts from.
5086 *
5087 * If the existing extent is larger than the request, initiate a
5088 * split. An attempt will be made at merging with adjacent extents.
5089 *
5090 * The caller is responsible for passing down meta_ac if we'll need it.
5091 */
5092int ocfs2_change_extent_flag(handle_t *handle,
5093                 struct ocfs2_extent_tree *et,
5094                 u32 cpos, u32 len, u32 phys,
5095                 struct ocfs2_alloc_context *meta_ac,
5096                 struct ocfs2_cached_dealloc_ctxt *dealloc,
5097                 int new_flags, int clear_flags)
5098{
5099    int ret, index;
5100    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5101    u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5102    struct ocfs2_extent_rec split_rec;
5103    struct ocfs2_path *left_path = NULL;
5104    struct ocfs2_extent_list *el;
5105    struct ocfs2_extent_rec *rec;
5106
5107    left_path = ocfs2_new_path_from_et(et);
5108    if (!left_path) {
5109        ret = -ENOMEM;
5110        mlog_errno(ret);
5111        goto out;
5112    }
5113
5114    ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5115    if (ret) {
5116        mlog_errno(ret);
5117        goto out;
5118    }
5119    el = path_leaf_el(left_path);
5120
5121    index = ocfs2_search_extent_list(el, cpos);
5122    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5123        ocfs2_error(sb,
5124                "Owner %llu has an extent at cpos %u which can no "
5125                "longer be found.\n",
5126                 (unsigned long long)
5127                 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5128        ret = -EROFS;
5129        goto out;
5130    }
5131
5132    ret = -EIO;
5133    rec = &el->l_recs[index];
5134    if (new_flags && (rec->e_flags & new_flags)) {
5135        mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5136             "extent that already had them",
5137             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5138             new_flags);
5139        goto out;
5140    }
5141
5142    if (clear_flags && !(rec->e_flags & clear_flags)) {
5143        mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5144             "extent that didn't have them",
5145             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5146             clear_flags);
5147        goto out;
5148    }
5149
5150    memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5151    split_rec.e_cpos = cpu_to_le32(cpos);
5152    split_rec.e_leaf_clusters = cpu_to_le16(len);
5153    split_rec.e_blkno = cpu_to_le64(start_blkno);
5154    split_rec.e_flags = rec->e_flags;
5155    if (new_flags)
5156        split_rec.e_flags |= new_flags;
5157    if (clear_flags)
5158        split_rec.e_flags &= ~clear_flags;
5159
5160    ret = ocfs2_split_extent(handle, et, left_path,
5161                 index, &split_rec, meta_ac,
5162                 dealloc);
5163    if (ret)
5164        mlog_errno(ret);
5165
5166out:
5167    ocfs2_free_path(left_path);
5168    return ret;
5169
5170}
5171
5172/*
5173 * Mark the already-existing extent at cpos as written for len clusters.
5174 * This removes the unwritten extent flag.
5175 *
5176 * If the existing extent is larger than the request, initiate a
5177 * split. An attempt will be made at merging with adjacent extents.
5178 *
5179 * The caller is responsible for passing down meta_ac if we'll need it.
5180 */
5181int ocfs2_mark_extent_written(struct inode *inode,
5182                  struct ocfs2_extent_tree *et,
5183                  handle_t *handle, u32 cpos, u32 len, u32 phys,
5184                  struct ocfs2_alloc_context *meta_ac,
5185                  struct ocfs2_cached_dealloc_ctxt *dealloc)
5186{
5187    int ret;
5188
5189    trace_ocfs2_mark_extent_written(
5190        (unsigned long long)OCFS2_I(inode)->ip_blkno,
5191        cpos, len, phys);
5192
5193    if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5194        ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5195                "that are being written to, but the feature bit "
5196                "is not set in the super block.",
5197                (unsigned long long)OCFS2_I(inode)->ip_blkno);
5198        ret = -EROFS;
5199        goto out;
5200    }
5201
5202    /*
5203     * XXX: This should be fixed up so that we just re-insert the
5204     * next extent records.
5205     */
5206    ocfs2_et_extent_map_truncate(et, 0);
5207
5208    ret = ocfs2_change_extent_flag(handle, et, cpos,
5209                       len, phys, meta_ac, dealloc,
5210                       0, OCFS2_EXT_UNWRITTEN);
5211    if (ret)
5212        mlog_errno(ret);
5213
5214out:
5215    return ret;
5216}
5217
5218static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5219                struct ocfs2_path *path,
5220                int index, u32 new_range,
5221                struct ocfs2_alloc_context *meta_ac)
5222{
5223    int ret, depth, credits;
5224    struct buffer_head *last_eb_bh = NULL;
5225    struct ocfs2_extent_block *eb;
5226    struct ocfs2_extent_list *rightmost_el, *el;
5227    struct ocfs2_extent_rec split_rec;
5228    struct ocfs2_extent_rec *rec;
5229    struct ocfs2_insert_type insert;
5230
5231    /*
5232     * Setup the record to split before we grow the tree.
5233     */
5234    el = path_leaf_el(path);
5235    rec = &el->l_recs[index];
5236    ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5237                   &split_rec, new_range, rec);
5238
5239    depth = path->p_tree_depth;
5240    if (depth > 0) {
5241        ret = ocfs2_read_extent_block(et->et_ci,
5242                          ocfs2_et_get_last_eb_blk(et),
5243                          &last_eb_bh);
5244        if (ret < 0) {
5245            mlog_errno(ret);
5246            goto out;
5247        }
5248
5249        eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5250        rightmost_el = &eb->h_list;
5251    } else
5252        rightmost_el = path_leaf_el(path);
5253
5254    credits = path->p_tree_depth +
5255          ocfs2_extend_meta_needed(et->et_root_el);
5256    ret = ocfs2_extend_trans(handle, credits);
5257    if (ret) {
5258        mlog_errno(ret);
5259        goto out;
5260    }
5261
5262    if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5263        le16_to_cpu(rightmost_el->l_count)) {
5264        ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5265                      meta_ac);
5266        if (ret) {
5267            mlog_errno(ret);
5268            goto out;
5269        }
5270    }
5271
5272    memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5273    insert.ins_appending = APPEND_NONE;
5274    insert.ins_contig = CONTIG_NONE;
5275    insert.ins_split = SPLIT_RIGHT;
5276    insert.ins_tree_depth = depth;
5277
5278    ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5279    if (ret)
5280        mlog_errno(ret);
5281
5282out:
5283    brelse(last_eb_bh);
5284    return ret;
5285}
5286
5287static int ocfs2_truncate_rec(handle_t *handle,
5288                  struct ocfs2_extent_tree *et,
5289                  struct ocfs2_path *path, int index,
5290                  struct ocfs2_cached_dealloc_ctxt *dealloc,
5291                  u32 cpos, u32 len)
5292{
5293    int ret;
5294    u32 left_cpos, rec_range, trunc_range;
5295    int wants_rotate = 0, is_rightmost_tree_rec = 0;
5296    struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5297    struct ocfs2_path *left_path = NULL;
5298    struct ocfs2_extent_list *el = path_leaf_el(path);
5299    struct ocfs2_extent_rec *rec;
5300    struct ocfs2_extent_block *eb;
5301
5302    if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5303        ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5304        if (ret) {
5305            mlog_errno(ret);
5306            goto out;
5307        }
5308
5309        index--;
5310    }
5311
5312    if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5313        path->p_tree_depth) {
5314        /*
5315         * Check whether this is the rightmost tree record. If
5316         * we remove all of this record or part of its right
5317         * edge then an update of the record lengths above it
5318         * will be required.
5319         */
5320        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5321        if (eb->h_next_leaf_blk == 0)
5322            is_rightmost_tree_rec = 1;
5323    }
5324
5325    rec = &el->l_recs[index];
5326    if (index == 0 && path->p_tree_depth &&
5327        le32_to_cpu(rec->e_cpos) == cpos) {
5328        /*
5329         * Changing the leftmost offset (via partial or whole
5330         * record truncate) of an interior (or rightmost) path
5331         * means we have to update the subtree that is formed
5332         * by this leaf and the one to it's left.
5333         *
5334         * There are two cases we can skip:
5335         * 1) Path is the leftmost one in our btree.
5336         * 2) The leaf is rightmost and will be empty after
5337         * we remove the extent record - the rotate code
5338         * knows how to update the newly formed edge.
5339         */
5340
5341        ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5342        if (ret) {
5343            mlog_errno(ret);
5344            goto out;
5345        }
5346
5347        if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5348            left_path = ocfs2_new_path_from_path(path);
5349            if (!left_path) {
5350                ret = -ENOMEM;
5351                mlog_errno(ret);
5352                goto out;
5353            }
5354
5355            ret = ocfs2_find_path(et->et_ci, left_path,
5356                          left_cpos);
5357            if (ret) {
5358                mlog_errno(ret);
5359                goto out;
5360            }
5361        }
5362    }
5363
5364    ret = ocfs2_extend_rotate_transaction(handle, 0,
5365                          handle->h_buffer_credits,
5366                          path);
5367    if (ret) {
5368        mlog_errno(ret);
5369        goto out;
5370    }
5371
5372    ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5373    if (ret) {
5374        mlog_errno(ret);
5375        goto out;
5376    }
5377
5378    ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5379    if (ret) {
5380        mlog_errno(ret);
5381        goto out;
5382    }
5383
5384    rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5385    trunc_range = cpos + len;
5386
5387    if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5388        int next_free;
5389
5390        memset(rec, 0, sizeof(*rec));
5391        ocfs2_cleanup_merge(el, index);
5392        wants_rotate = 1;
5393
5394        next_free = le16_to_cpu(el->l_next_free_rec);
5395        if (is_rightmost_tree_rec && next_free > 1) {
5396            /*
5397             * We skip the edge update if this path will
5398             * be deleted by the rotate code.
5399             */
5400            rec = &el->l_recs[next_free - 1];
5401            ocfs2_adjust_rightmost_records(handle, et, path,
5402                               rec);
5403        }
5404    } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5405        /* Remove leftmost portion of the record. */
5406        le32_add_cpu(&rec->e_cpos, len);
5407        le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5408        le16_add_cpu(&rec->e_leaf_clusters, -len);
5409    } else if (rec_range == trunc_range) {
5410        /* Remove rightmost portion of the record */
5411        le16_add_cpu(&rec->e_leaf_clusters, -len);
5412        if (is_rightmost_tree_rec)
5413            ocfs2_adjust_rightmost_records(handle, et, path, rec);
5414    } else {
5415        /* Caller should have trapped this. */
5416        mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5417             "(%u, %u)\n",
5418             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5419             le32_to_cpu(rec->e_cpos),
5420             le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5421        BUG();
5422    }
5423
5424    if (left_path) {
5425        int subtree_index;
5426
5427        subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5428        ocfs2_complete_edge_insert(handle, left_path, path,
5429                       subtree_index);
5430    }
5431
5432    ocfs2_journal_dirty(handle, path_leaf_bh(path));
5433
5434    ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5435    if (ret) {
5436        mlog_errno(ret);
5437        goto out;
5438    }
5439
5440out:
5441    ocfs2_free_path(left_path);
5442    return ret;
5443}
5444
5445int ocfs2_remove_extent(handle_t *handle,
5446            struct ocfs2_extent_tree *et,
5447            u32 cpos, u32 len,
5448            struct ocfs2_alloc_context *meta_ac,
5449            struct ocfs2_cached_dealloc_ctxt *dealloc)
5450{
5451    int ret, index;
5452    u32 rec_range, trunc_range;
5453    struct ocfs2_extent_rec *rec;
5454    struct ocfs2_extent_list *el;
5455    struct ocfs2_path *path = NULL;
5456
5457    /*
5458     * XXX: Why are we truncating to 0 instead of wherever this
5459     * affects us?
5460     */
5461    ocfs2_et_extent_map_truncate(et, 0);
5462
5463    path = ocfs2_new_path_from_et(et);
5464    if (!path) {
5465        ret = -ENOMEM;
5466        mlog_errno(ret);
5467        goto out;
5468    }
5469
5470    ret = ocfs2_find_path(et->et_ci, path, cpos);
5471    if (ret) {
5472        mlog_errno(ret);
5473        goto out;
5474    }
5475
5476    el = path_leaf_el(path);
5477    index = ocfs2_search_extent_list(el, cpos);
5478    if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5479        ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5480                "Owner %llu has an extent at cpos %u which can no "
5481                "longer be found.\n",
5482                (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5483                cpos);
5484        ret = -EROFS;
5485        goto out;
5486    }
5487
5488    /*
5489     * We have 3 cases of extent removal:
5490     * 1) Range covers the entire extent rec
5491     * 2) Range begins or ends on one edge of the extent rec
5492     * 3) Range is in the middle of the extent rec (no shared edges)
5493     *
5494     * For case 1 we remove the extent rec and left rotate to
5495     * fill the hole.
5496     *
5497     * For case 2 we just shrink the existing extent rec, with a
5498     * tree update if the shrinking edge is also the edge of an
5499     * extent block.
5500     *
5501     * For case 3 we do a right split to turn the extent rec into
5502     * something case 2 can handle.
5503     */
5504    rec = &el->l_recs[index];
5505    rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5506    trunc_range = cpos + len;
5507
5508    BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5509
5510    trace_ocfs2_remove_extent(
5511        (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5512        cpos, len, index, le32_to_cpu(rec->e_cpos),
5513        ocfs2_rec_clusters(el, rec));
5514
5515    if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5516        ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5517                     cpos, len);
5518        if (ret) {
5519            mlog_errno(ret);
5520            goto out;
5521        }
5522    } else {
5523        ret = ocfs2_split_tree(handle, et, path, index,
5524                       trunc_range, meta_ac);
5525        if (ret) {
5526            mlog_errno(ret);
5527            goto out;
5528        }
5529
5530        /*
5531         * The split could have manipulated the tree enough to
5532         * move the record location, so we have to look for it again.
5533         */
5534        ocfs2_reinit_path(path, 1);
5535
5536        ret = ocfs2_find_path(et->et_ci, path, cpos);
5537        if (ret) {
5538            mlog_errno(ret);
5539            goto out;
5540        }
5541
5542        el = path_leaf_el(path);
5543        index = ocfs2_search_extent_list(el, cpos);
5544        if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5545            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5546                    "Owner %llu: split at cpos %u lost record.",
5547                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5548                    cpos);
5549            ret = -EROFS;
5550            goto out;
5551        }
5552
5553        /*
5554         * Double check our values here. If anything is fishy,
5555         * it's easier to catch it at the top level.
5556         */
5557        rec = &el->l_recs[index];
5558        rec_range = le32_to_cpu(rec->e_cpos) +
5559            ocfs2_rec_clusters(el, rec);
5560        if (rec_range != trunc_range) {
5561            ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5562                    "Owner %llu: error after split at cpos %u"
5563                    "trunc len %u, existing record is (%u,%u)",
5564                    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5565                    cpos, len, le32_to_cpu(rec->e_cpos),
5566                    ocfs2_rec_clusters(el, rec));
5567            ret = -EROFS;
5568            goto out;
5569        }
5570
5571        ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5572                     cpos, len);
5573        if (ret) {
5574            mlog_errno(ret);
5575            goto out;
5576        }
5577    }
5578
5579out:
5580    ocfs2_free_path(path);
5581    return ret;
5582}
5583
5584/*
5585 * ocfs2_reserve_blocks_for_rec_trunc() would look basically the
5586 * same as ocfs2_lock_alloctors(), except for it accepts a blocks
5587 * number to reserve some extra blocks, and it only handles meta
5588 * data allocations.
5589 *
5590 * Currently, only ocfs2_remove_btree_range() uses it for truncating
5591 * and punching holes.
5592 */
5593static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
5594                          struct ocfs2_extent_tree *et,
5595                          u32 extents_to_split,
5596                          struct ocfs2_alloc_context **ac,
5597                          int extra_blocks)