
/*
 * linux/fs/buffer.c
 *
 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations. Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
 *
 * Added 32k buffer block sizes - these are required on older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/cleancache.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
    bh->b_end_io = handler;
    bh->b_private = private;
}
EXPORT_SYMBOL(init_buffer);

static int sleep_on_buffer(void *word)
{
    io_schedule();
    return 0;
}

void __lock_buffer(struct buffer_head *bh)
{
    wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
                            TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
    clear_bit_unlock(BH_Lock, &bh->b_state);
    smp_mb__after_clear_bit();
    wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Block until a buffer comes unlocked. This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
    wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);
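
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * since __wait_on_buffer() does not stop the buffer from being locked
 * again immediately afterwards, callers that need a stable view take
 * the lock themselves rather than merely waiting:
 *
 *	lock_buffer(bh);		// sleeps via __lock_buffer() if contended
 *	// ... examine or modify bh->b_data under BH_Lock ...
 *	unlock_buffer(bh);		// wakes anyone in __wait_on_buffer()
 */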

static void
__clear_page_buffers(struct page *page)
{
    ClearPagePrivate(page);
    set_page_private(page, 0);
    page_cache_release(page);
}


static int quiet_error(struct buffer_head *bh)
{
    if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
        return 0;
    return 1;
}


static void buffer_io_error(struct buffer_head *bh)
{
    char b[BDEVNAME_SIZE];
    printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
            bdevname(bh->b_bdev, b),
            (unsigned long long)bh->b_blocknr);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
    if (uptodate) {
        set_buffer_uptodate(bh);
    } else {
        /* This happens, due to failed READA attempts. */
        clear_buffer_uptodate(bh);
    }
    unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
    __end_buffer_read_notouch(bh, uptodate);
    put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
    char b[BDEVNAME_SIZE];

    if (uptodate) {
        set_buffer_uptodate(bh);
    } else {
        if (!quiet_error(bh)) {
            buffer_io_error(bh);
            printk(KERN_WARNING "lost page write due to "
                    "I/O error on %s\n",
                       bdevname(bh->b_bdev, b));
        }
        set_buffer_write_io_error(bh);
        clear_buffer_uptodate(bh);
    }
    unlock_buffer(bh);
    put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers. To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
 * may be quite high. This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock. (But if
 * private_lock is contended then so is mapping->tree_lock).
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
    struct inode *bd_inode = bdev->bd_inode;
    struct address_space *bd_mapping = bd_inode->i_mapping;
    struct buffer_head *ret = NULL;
    pgoff_t index;
    struct buffer_head *bh;
    struct buffer_head *head;
    struct page *page;
    int all_mapped = 1;

    index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
    page = find_get_page(bd_mapping, index);
    if (!page)
        goto out;

    spin_lock(&bd_mapping->private_lock);
    if (!page_has_buffers(page))
        goto out_unlock;
    head = page_buffers(page);
    bh = head;
    do {
        if (!buffer_mapped(bh))
            all_mapped = 0;
        else if (bh->b_blocknr == block) {
            ret = bh;
            get_bh(bh);
            goto out_unlock;
        }
        bh = bh->b_this_page;
    } while (bh != head);

    /* we might be here because some of the buffers on this page are
     * not mapped. This is due to various races between
     * file io on the block device and getblk. It gets dealt with
     * elsewhere, don't buffer_error if we had some unmapped buffers
     */
    if (all_mapped) {
        printk("__find_get_block_slow() failed. "
            "block=%llu, b_blocknr=%llu\n",
            (unsigned long long)block,
            (unsigned long long)bh->b_blocknr);
        printk("b_state=0x%08lx, b_size=%zu\n",
            bh->b_state, bh->b_size);
        printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
    }
out_unlock:
    spin_unlock(&bd_mapping->private_lock);
    page_cache_release(page);
out:
    return ret;
}

/* If invalidate_buffers() will trash dirty buffers, it means some kind
   of fs corruption is going on. Trashing dirty data always implies losing
   information that was supposed to be just stored on the physical layer
   by the user.

   Thus invalidate_buffers in general usage is not allowed to trash
   dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
   be preserved. These buffers are simply skipped.

   We also skip buffers which are still in use. For example this can
   happen if a userspace program is reading the block device.

   NOTE: In the case where the user removed a removable-media disk even if
   there's still dirty data not synced on disk (due to a bug in the device
   driver or due to an error of the user), by not destroying the dirty
   buffers we could generate corruption also on the next media inserted.
   Thus a parameter is necessary to handle this case in the safest way
   possible (trying not to corrupt the newly inserted disk with data
   belonging to the old, now corrupted, disk). Also for the ramdisk the
   natural thing to do in order to release the ramdisk memory is to
   destroy dirty buffers.

   These are two special cases. Normal usage implies that the device
   driver issues a sync on the device (without waiting for I/O completion)
   and then an invalidate_buffers call that doesn't trash dirty buffers.

   For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
   buffer. NOTE: re-reading from disk is destructive so we can do it only
   when we assume nobody is changing the buffercache under our I/O and when
   we think the disk contains more recent information than the buffercache.
   The update == 1 pass marks the buffers we need to update, the update == 2
   pass does the actual I/O. */
void invalidate_bdev(struct block_device *bdev)
{
    struct address_space *mapping = bdev->bd_inode->i_mapping;

    if (mapping->nrpages == 0)
        return;

    invalidate_bh_lrus();
    lru_add_drain_all(); /* make sure all lru add caches are flushed */
    invalidate_mapping_pages(mapping, 0, -1);
    /* 99% of the time, we don't need to flush the cleancache on the bdev.
     * But, for the strange corners, let's be cautious
     */
    cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);

/*
 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
 */
static void free_more_memory(void)
{
    struct zone *zone;
    int nid;

    wakeup_flusher_threads(1024);
    yield();

    for_each_online_node(nid) {
        (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
                        gfp_zone(GFP_NOFS), NULL,
                        &zone);
        if (zone)
            try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
                        GFP_NOFS, NULL);
    }
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
    unsigned long flags;
    struct buffer_head *first;
    struct buffer_head *tmp;
    struct page *page;
    int page_uptodate = 1;

    BUG_ON(!buffer_async_read(bh));

    page = bh->b_page;
    if (uptodate) {
        set_buffer_uptodate(bh);
    } else {
        clear_buffer_uptodate(bh);
        if (!quiet_error(bh))
            buffer_io_error(bh);
        SetPageError(page);
    }

    /*
     * Be _very_ careful from here on. Bad things can happen if
     * two buffer heads end IO at almost the same time and both
     * decide that the page is now completely done.
     */
    first = page_buffers(page);
    local_irq_save(flags);
    bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
    clear_buffer_async_read(bh);
    unlock_buffer(bh);
    tmp = bh;
    do {
        if (!buffer_uptodate(tmp))
            page_uptodate = 0;
        if (buffer_async_read(tmp)) {
            BUG_ON(!buffer_locked(tmp));
            goto still_busy;
        }
        tmp = tmp->b_this_page;
    } while (tmp != bh);
    bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
    local_irq_restore(flags);

    /*
     * If none of the buffers had errors and they are all
     * uptodate then we can set the page uptodate.
     */
    if (page_uptodate && !PageError(page))
        SetPageUptodate(page);
    unlock_page(page);
    return;

still_busy:
    bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
    local_irq_restore(flags);
    return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
    char b[BDEVNAME_SIZE];
    unsigned long flags;
    struct buffer_head *first;
    struct buffer_head *tmp;
    struct page *page;

    BUG_ON(!buffer_async_write(bh));

    page = bh->b_page;
    if (uptodate) {
        set_buffer_uptodate(bh);
    } else {
        if (!quiet_error(bh)) {
            buffer_io_error(bh);
            printk(KERN_WARNING "lost page write due to "
                    "I/O error on %s\n",
                   bdevname(bh->b_bdev, b));
        }
        set_bit(AS_EIO, &page->mapping->flags);
        set_buffer_write_io_error(bh);
        clear_buffer_uptodate(bh);
        SetPageError(page);
    }

    first = page_buffers(page);
    local_irq_save(flags);
    bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

    clear_buffer_async_write(bh);
    unlock_buffer(bh);
    tmp = bh->b_this_page;
    while (tmp != bh) {
        if (buffer_async_write(tmp)) {
            BUG_ON(!buffer_locked(tmp));
            goto still_busy;
        }
        tmp = tmp->b_this_page;
    }
    bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
    local_irq_restore(flags);
    end_page_writeback(page);
    return;

still_busy:
    bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
    local_irq_restore(flags);
    return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed. This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O against any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
    bh->b_end_io = end_buffer_async_read;
    set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
                      bh_end_io_t *handler)
{
    bh->b_end_io = handler;
    set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
    mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions. A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync(). For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed. But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers. Which is different from the address_space
 * against which the buffers are listed. So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list! In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want. The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
 * filesystems should do that. invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
 * take an address_space, not an inode. And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list. Because if the buffer is on a list,
 * it *must* already be on the right one. If not, the filesystem is being
 * silly. This will save a ton of locking. But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate). That requires careful auditing of all
 * filesystems (do it inside bforget()). It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
    list_del_init(&bh->b_assoc_buffers);
    WARN_ON(!bh->b_assoc_map);
    if (buffer_write_io_error(bh))
        set_bit(AS_EIO, &bh->b_assoc_map->flags);
    bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
    return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io. It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion. Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
    struct buffer_head *bh;
    struct list_head *p;
    int err = 0;

    spin_lock(lock);
repeat:
    list_for_each_prev(p, list) {
        bh = BH_ENTRY(p);
        if (buffer_locked(bh)) {
            get_bh(bh);
            spin_unlock(lock);
            wait_on_buffer(bh);
            if (!buffer_uptodate(bh))
                err = -EIO;
            brelse(bh);
            spin_lock(lock);
            goto repeat;
        }
    }
    spin_unlock(lock);
    return err;
}
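
/*
 * A minimal sketch of the O_SYNC pattern described above (hypothetical
 * filesystem code, not part of this file): submit the writes as the
 * buffers are dirtied, then wait only for the I/O that was actually
 * queued:
 *
 *	mark_buffer_dirty(bh);
 *	ll_rw_block(WRITE, 1, &bh);		// queue the write now
 *	...
 *	err = osync_buffers_list(lock, list);	// wait; queues nothing new
 */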

static void do_thaw_one(struct super_block *sb, void *unused)
{
    char b[BDEVNAME_SIZE];
    while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
        printk(KERN_WARNING "Emergency Thaw on %s\n",
               bdevname(sb->s_bdev, b));
}

static void do_thaw_all(struct work_struct *work)
{
    iterate_supers(do_thaw_one, NULL);
    kfree(work);
    printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq
 */
void emergency_thaw_all(void)
{
    struct work_struct *work;

    work = kmalloc(sizeof(*work), GFP_ATOMIC);
    if (work) {
        INIT_WORK(work, do_thaw_all);
        schedule_work(work);
    }
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
    struct address_space *buffer_mapping = mapping->assoc_mapping;

    if (buffer_mapping == NULL || list_empty(&mapping->private_list))
        return 0;

    return fsync_buffers_list(&buffer_mapping->private_lock,
                    &mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
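
/*
 * A minimal sketch of a filesystem ->fsync() built on this helper
 * (hypothetical code, not part of this file; the exact ->fsync()
 * prototype varies by kernel version):
 *
 *	static int example_fsync(struct file *file, int datasync)
 *	{
 *		struct inode *inode = file->f_mapping->host;
 *		int err = sync_mapping_buffers(inode->i_mapping);
 *
 *		// ... then write the inode itself if it is dirty ...
 *		return err;
 *	}
 */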

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer. This means that the block at
 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
 * dirty, schedule it for IO. So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
            sector_t bblock, unsigned blocksize)
{
    struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
    if (bh) {
        if (buffer_dirty(bh))
            ll_rw_block(WRITE, 1, &bh);
        put_bh(bh);
    }
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
    struct address_space *mapping = inode->i_mapping;
    struct address_space *buffer_mapping = bh->b_page->mapping;

    mark_buffer_dirty(bh);
    if (!mapping->assoc_mapping) {
        mapping->assoc_mapping = buffer_mapping;
    } else {
        BUG_ON(mapping->assoc_mapping != buffer_mapping);
    }
    if (!bh->b_assoc_map) {
        spin_lock(&buffer_mapping->private_lock);
        list_move_tail(&bh->b_assoc_buffers,
                &mapping->private_list);
        bh->b_assoc_map = mapping;
        spin_unlock(&buffer_mapping->private_lock);
    }
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
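
/*
 * A minimal sketch of the intended use (hypothetical filesystem code,
 * not part of this file): when a file's metadata lives in a blockdev
 * buffer (e.g. an ext2 indirect block), dirty it against the inode so
 * that a later sync_mapping_buffers() from fsync() finds it:
 *
 *	struct buffer_head *bh = sb_getblk(sb, metadata_block);
 *
 *	// ... modify bh->b_data ...
 *	mark_buffer_dirty_inode(bh, inode);	// queues on ->private_list
 *	brelse(bh);
 */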

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 */
static void __set_page_dirty(struct page *page,
        struct address_space *mapping, int warn)
{
    spin_lock_irq(&mapping->tree_lock);
    if (page->mapping) { /* Race with truncate? */
        WARN_ON_ONCE(warn && !PageUptodate(page));
        account_page_dirtied(page, mapping);
        radix_tree_tag_set(&mapping->page_tree,
                page_index(page), PAGECACHE_TAG_DIRTY);
    }
    spin_unlock_irq(&mapping->tree_lock);
    __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking. It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers. If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied. There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness. That's fine. If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list. Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well. That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
    int newly_dirty;
    struct address_space *mapping = page_mapping(page);

    if (unlikely(!mapping))
        return !TestSetPageDirty(page);

    spin_lock(&mapping->private_lock);
    if (page_has_buffers(page)) {
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;

        do {
            set_buffer_dirty(bh);
            bh = bh->b_this_page;
        } while (bh != head);
    }
    newly_dirty = !TestSetPageDirty(page);
    spin_unlock(&mapping->private_lock);

    if (newly_dirty)
        __set_page_dirty(page, mapping, 1);
    return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
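
/*
 * A minimal sketch of where this is wired up (hypothetical filesystem
 * code, not part of this file): buffer-backed address_spaces point their
 * ->set_page_dirty operation at this helper so that dirtying a page also
 * dirties its buffers:
 *
 *	static const struct address_space_operations example_aops = {
 *		.set_page_dirty	= __set_page_dirty_buffers,
 *		// ... readpage, writepage, etc ...
 *	};
 */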

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't. After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go. Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list. So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
    struct buffer_head *bh;
    struct list_head tmp;
    struct address_space *mapping;
    int err = 0, err2;
    struct blk_plug plug;

    INIT_LIST_HEAD(&tmp);
    blk_start_plug(&plug);

    spin_lock(lock);
    while (!list_empty(list)) {
        bh = BH_ENTRY(list->next);
        mapping = bh->b_assoc_map;
        __remove_assoc_queue(bh);
        /* Avoid race with mark_buffer_dirty_inode() which does
         * a lockless check and we rely on seeing the dirty bit */
        smp_mb();
        if (buffer_dirty(bh) || buffer_locked(bh)) {
            list_add(&bh->b_assoc_buffers, &tmp);
            bh->b_assoc_map = mapping;
            if (buffer_dirty(bh)) {
                get_bh(bh);
                spin_unlock(lock);
                /*
                 * Ensure any pending I/O completes so that
                 * write_dirty_buffer() actually writes the
                 * current contents - it is a noop if I/O is
                 * still in flight on potentially older
                 * contents.
                 */
                write_dirty_buffer(bh, WRITE_SYNC);

                /*
                 * Kick off IO for the previous mapping. Note
                 * that we will not run the very last mapping,
                 * wait_on_buffer() will do that for us
                 * through sync_buffer().
                 */
                brelse(bh);
                spin_lock(lock);
            }
        }
    }

    spin_unlock(lock);
    blk_finish_plug(&plug);
    spin_lock(lock);

    while (!list_empty(&tmp)) {
        bh = BH_ENTRY(tmp.prev);
        get_bh(bh);
        mapping = bh->b_assoc_map;
        __remove_assoc_queue(bh);
        /* Avoid race with mark_buffer_dirty_inode() which does
         * a lockless check and we rely on seeing the dirty bit */
        smp_mb();
        if (buffer_dirty(bh)) {
            list_add(&bh->b_assoc_buffers,
                 &mapping->private_list);
            bh->b_assoc_map = mapping;
        }
        spin_unlock(lock);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh))
            err = -EIO;
        brelse(bh);
        spin_lock(lock);
    }

    spin_unlock(lock);
    err2 = osync_buffers_list(lock, list);
    if (err)
        return err;
    else
        return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode. We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync(). Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
 * assumes that all the buffers are against the blockdev. Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
    if (inode_has_buffers(inode)) {
        struct address_space *mapping = &inode->i_data;
        struct list_head *list = &mapping->private_list;
        struct address_space *buffer_mapping = mapping->assoc_mapping;

        spin_lock(&buffer_mapping->private_lock);
        while (!list_empty(list))
            __remove_assoc_queue(BH_ENTRY(list->next));
        spin_unlock(&buffer_mapping->private_lock);
    }
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list. This is called
 * when we're trying to free the inode itself. Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
    int ret = 1;

    if (inode_has_buffers(inode)) {
        struct address_space *mapping = &inode->i_data;
        struct list_head *list = &mapping->private_list;
        struct address_space *buffer_mapping = mapping->assoc_mapping;

        spin_lock(&buffer_mapping->private_lock);
        while (!list_empty(list)) {
            struct buffer_head *bh = BH_ENTRY(list->next);
            if (buffer_dirty(bh)) {
                ret = 0;
                break;
            }
            __remove_assoc_queue(bh);
        }
        spin_unlock(&buffer_mapping->private_lock);
    }
    return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created. Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
        int retry)
{
    struct buffer_head *bh, *head;
    long offset;

try_again:
    head = NULL;
    offset = PAGE_SIZE;
    while ((offset -= size) >= 0) {
        bh = alloc_buffer_head(GFP_NOFS);
        if (!bh)
            goto no_grow;

        bh->b_bdev = NULL;
        bh->b_this_page = head;
        bh->b_blocknr = -1;
        head = bh;

        bh->b_state = 0;
        atomic_set(&bh->b_count, 0);
        bh->b_size = size;

        /* Link the buffer to its page */
        set_bh_page(bh, page, offset);

        init_buffer(bh, NULL, NULL);
    }
    return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
    if (head) {
        do {
            bh = head;
            head = head->b_this_page;
            free_buffer_head(bh);
        } while (head);
    }

    /*
     * Return failure for non-async IO requests. Async IO requests
     * are not allowed to fail, so we have to wait until buffer heads
     * become available. But we don't want tasks sleeping with
     * partially complete buffers, so all were released above.
     */
    if (!retry)
        return NULL;

    /* We're _really_ low on memory. Now we just
     * wait for old buffer heads to become free due to
     * finishing IO. Since this is an async request and
     * the reserve list is empty, we're sure there are
     * async buffer heads in use.
     */
    free_more_memory();
    goto try_again;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);
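
/*
 * A minimal sketch of a typical caller (hypothetical code, not part of
 * this file); create_empty_buffers() below does essentially this before
 * attaching the ring to the page:
 *
 *	struct buffer_head *head = alloc_page_buffers(page, blocksize, 1);
 *
 *	// head is a NULL-terminated b_this_page chain, one bh per block;
 *	// with retry != 0 the call loops in free_more_memory() and never
 *	// returns NULL.
 */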

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
    struct buffer_head *bh, *tail;

    bh = head;
    do {
        tail = bh;
        bh = bh->b_this_page;
    } while (bh);
    tail->b_this_page = head;
    attach_page_buffers(page, head);
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static void
init_page_buffers(struct page *page, struct block_device *bdev,
            sector_t block, int size)
{
    struct buffer_head *head = page_buffers(page);
    struct buffer_head *bh = head;
    int uptodate = PageUptodate(page);

    do {
        if (!buffer_mapped(bh)) {
            init_buffer(bh, NULL, NULL);
            bh->b_bdev = bdev;
            bh->b_blocknr = block;
            if (uptodate)
                set_buffer_uptodate(bh);
            set_buffer_mapped(bh);
        }
        block++;
        bh = bh->b_this_page;
    } while (bh != head);
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
        pgoff_t index, int size)
{
    struct inode *inode = bdev->bd_inode;
    struct page *page;
    struct buffer_head *bh;

    page = find_or_create_page(inode->i_mapping, index,
        (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
    if (!page)
        return NULL;

    BUG_ON(!PageLocked(page));

    if (page_has_buffers(page)) {
        bh = page_buffers(page);
        if (bh->b_size == size) {
            init_page_buffers(page, bdev, block, size);
            return page;
        }
        if (!try_to_free_buffers(page))
            goto failed;
    }

    /*
     * Allocate some buffers for this page
     */
    bh = alloc_page_buffers(page, size, 0);
    if (!bh)
        goto failed;

    /*
     * Link the page to the buffers and initialise them. Take the
     * lock to be atomic wrt __find_get_block(), which does not
     * run under the page lock.
     */
    spin_lock(&inode->i_mapping->private_lock);
    link_dev_buffers(page, bh);
    init_page_buffers(page, bdev, block, size);
    spin_unlock(&inode->i_mapping->private_lock);
    return page;

failed:
    BUG();
    unlock_page(page);
    page_cache_release(page);
    return NULL;
}

/*
 * Create buffers for the specified block device block's page. If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
    struct page *page;
    pgoff_t index;
    int sizebits;

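    /*
     * Compute log2(PAGE_SIZE / size): `block' is a blocksize-sized
     * block number, and shifting it right by `sizebits' converts it
     * to the index of the page that holds it.
     */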
    sizebits = -1;
    do {
        sizebits++;
    } while ((size << sizebits) < PAGE_SIZE);

    index = block >> sizebits;

    /*
     * Check for a block which wants to lie outside our maximum possible
     * pagecache index. (this comparison is done using sector_t types).
     */
    if (unlikely(index != block >> sizebits)) {
        char b[BDEVNAME_SIZE];

        printk(KERN_ERR "%s: requested out-of-range block %llu for "
            "device %s\n",
            __func__, (unsigned long long)block,
            bdevname(bdev, b));
        return -EIO;
    }
    block = index << sizebits;
    /* Create a page with the proper size buffers.. */
    page = grow_dev_page(bdev, block, index, size);
    if (!page)
        return 0;
    unlock_page(page);
    page_cache_release(page);
    return 1;
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
    /* Size must be multiple of hard sectorsize */
    if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
            (size < 512 || size > PAGE_SIZE))) {
        printk(KERN_ERR "getblk(): invalid block size %d requested\n",
                    size);
        printk(KERN_ERR "logical block size: %d\n",
                    bdev_logical_block_size(bdev));

        dump_stack();
        return NULL;
    }

    for (;;) {
        struct buffer_head * bh;
        int ret;

        bh = __find_get_block(bdev, block, size);
        if (bh)
            return bh;

        ret = grow_buffers(bdev, block, size);
        if (ret < 0)
            return NULL;
        if (ret == 0)
            free_more_memory();
    }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in its radix tree.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page. If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also. When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate. But their backing page remains not
 * uptodate - even if all of its buffers are uptodate. A subsequent
 * block_read_full_page() against that page will discover all the uptodate
 * buffers, will set the page uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
    WARN_ON_ONCE(!buffer_uptodate(bh));

    /*
     * Very *carefully* optimize the it-is-already-dirty case.
     *
     * Don't let the final "is it dirty" escape to before we
     * perhaps modified the buffer.
     */
    if (buffer_dirty(bh)) {
        smp_mb();
        if (buffer_dirty(bh))
            return;
    }

    if (!test_set_buffer_dirty(bh)) {
        struct page *page = bh->b_page;
        if (!TestSetPageDirty(page)) {
            struct address_space *mapping = page_mapping(page);
            if (mapping)
                __set_page_dirty(page, mapping, 0);
        }
    }
}
EXPORT_SYMBOL(mark_buffer_dirty);
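
/*
 * A minimal sketch of the standard modify-and-dirty sequence
 * (hypothetical filesystem code, not part of this file):
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *
 *	if (bh) {
 *		lock_buffer(bh);
 *		// ... modify bh->b_data ...
 *		unlock_buffer(bh);
 *		mark_buffer_dirty(bh);	// writeback will pick it up later
 *		brelse(bh);
 *	}
 */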

/*
 * Decrement a buffer_head's reference count. If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head * buf)
{
    if (atomic_read(&buf->b_count)) {
        put_bh(buf);
        return;
    }
    WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
    clear_buffer_dirty(bh);
    if (bh->b_assoc_map) {
        struct address_space *buffer_mapping = bh->b_page->mapping;

        spin_lock(&buffer_mapping->private_lock);
        list_del_init(&bh->b_assoc_buffers);
        bh->b_assoc_map = NULL;
        spin_unlock(&buffer_mapping->private_lock);
    }
    __brelse(bh);
}
EXPORT_SYMBOL(__bforget);
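
/*
 * A minimal sketch of the brelse()/bforget() distinction (hypothetical
 * code, not part of this file):
 *
 *	mark_buffer_dirty(bh);
 *	brelse(bh);	// drop the reference; dirty data is still written back
 *
 *	mark_buffer_dirty(bh);
 *	bforget(bh);	// drop the reference AND discard the dirty data,
 *			// e.g. for metadata of a just-deleted file
 */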

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
    lock_buffer(bh);
    if (buffer_uptodate(bh)) {
        unlock_buffer(bh);
        return bh;
    } else {
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(READ, bh);
        wait_on_buffer(bh);
        if (buffer_uptodate(bh))
            return bh;
    }
    brelse(bh);
    return NULL;
}

/*
 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
 * refcount elevated by one when they're in an LRU. A buffer can only appear
 * once in a particular CPU's LRU. A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
 * a local interrupt disable for that.
 */

#define BH_LRU_SIZE 8

struct bh_lru {
    struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock() local_irq_disable()
#define bh_lru_unlock() local_irq_enable()
#else
#define bh_lru_lock() preempt_disable()
#define bh_lru_unlock() preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
    BUG_ON(irqs_disabled());
#endif
}

/*
 * The LRU management algorithm is dopey-but-simple. Sorry.
 */
static void bh_lru_install(struct buffer_head *bh)
{
    struct buffer_head *evictee = NULL;

    check_irqs_on();
    bh_lru_lock();
    if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
        struct buffer_head *bhs[BH_LRU_SIZE];
        int in;
        int out = 0;

        get_bh(bh);
        bhs[out++] = bh;
        for (in = 0; in < BH_LRU_SIZE; in++) {
            struct buffer_head *bh2 =
                __this_cpu_read(bh_lrus.bhs[in]);

            if (bh2 == bh) {
                __brelse(bh2);
            } else {
                if (out >= BH_LRU_SIZE) {
                    BUG_ON(evictee != NULL);
                    evictee = bh2;
                } else {
                    bhs[out++] = bh2;
                }
            }
        }
        while (out < BH_LRU_SIZE)
            bhs[out++] = NULL;
        memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
    }
    bh_lru_unlock();

    if (evictee)
        __brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
    struct buffer_head *ret = NULL;
    unsigned int i;

    check_irqs_on();
    bh_lru_lock();
    for (i = 0; i < BH_LRU_SIZE; i++) {
        struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

        if (bh && bh->b_bdev == bdev &&
                bh->b_blocknr == block && bh->b_size == size) {
            if (i) {
                while (i) {
                    __this_cpu_write(bh_lrus.bhs[i],
                        __this_cpu_read(bh_lrus.bhs[i - 1]));
                    i--;
                }
                __this_cpu_write(bh_lrus.bhs[0], bh);
            }
            get_bh(bh);
            ret = bh;
            break;
        }
    }
    bh_lru_unlock();
    return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
 * it in the LRU and mark it as accessed. If it is not present then return
 * NULL
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
    struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

    if (bh == NULL) {
        bh = __find_get_block_slow(bdev, block);
        if (bh)
            bh_lru_install(bh);
    }
    if (bh)
        touch_buffer(bh);
    return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk() cannot fail - it just keeps trying. If you pass it an
 * illegal block number, __getblk() will happily return a buffer_head
 * which represents the non-existent block. Very weird.
 *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing. FIXME, perhaps?
 */
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
    struct buffer_head *bh = __find_get_block(bdev, block, size);

    might_sleep();
    if (bh == NULL)
        bh = __getblk_slow(bdev, block, size);
    return bh;
}
EXPORT_SYMBOL(__getblk);
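
/*
 * A minimal sketch of __getblk() in use (hypothetical code, not part of
 * this file; sb_getblk() is the usual wrapper): get a bh for a block
 * that is about to be overwritten, without reading it from disk first:
 *
 *	struct buffer_head *bh = __getblk(bdev, blocknr, blocksize);
 *
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */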

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
    struct buffer_head *bh = __getblk(bdev, block, size);
    if (likely(bh)) {
        ll_rw_block(READA, 1, &bh);
        brelse(bh);
    }
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread() - reads a specified block and returns the bh
 * @bdev: the block_device to read from
 * @block: number of block
 * @size: size (in bytes) to read
 *
 * Reads a specified block, and returns buffer head that contains it.
 * It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
{
    struct buffer_head *bh = __getblk(bdev, block, size);

    if (likely(bh) && !buffer_uptodate(bh))
        bh = __bread_slow(bh);
    return bh;
}
EXPORT_SYMBOL(__bread);
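
/*
 * A minimal sketch of the common read pattern (hypothetical filesystem
 * code, not part of this file); sb_bread() is the usual wrapper around
 * __bread():
 *
 *	struct buffer_head *bh = sb_bread(sb, blocknr);
 *
 *	if (!bh)
 *		return -EIO;	// the block was unreadable
 *	// ... read the on-disk structure from bh->b_data ...
 *	brelse(bh);
 */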

/*
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
    struct bh_lru *b = &get_cpu_var(bh_lrus);
    int i;

    for (i = 0; i < BH_LRU_SIZE; i++) {
        brelse(b->bhs[i]);
        b->bhs[i] = NULL;
    }
    put_cpu_var(bh_lrus);
}

void invalidate_bh_lrus(void)
{
    on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

void set_bh_page(struct buffer_head *bh,
        struct page *page, unsigned long offset)
{
    bh->b_page = page;
    BUG_ON(offset >= PAGE_SIZE);
    if (PageHighMem(page))
        /*
         * This catches illegal uses and preserves the offset:
         */
        bh->b_data = (char *)(0 + offset);
    else
        bh->b_data = page_address(page) + offset;
}
EXPORT_SYMBOL(set_bh_page);

/*
 * Called when truncating a buffer on a page completely.
 */
static void discard_buffer(struct buffer_head * bh)
{
    lock_buffer(bh);
    clear_buffer_dirty(bh);
    bh->b_bdev = NULL;
    clear_buffer_mapped(bh);
    clear_buffer_req(bh);
    clear_buffer_new(bh);
    clear_buffer_delay(bh);
    clear_buffer_unwritten(bh);
    unlock_buffer(bh);
}

/**
 * block_invalidatepage - invalidate part or all of a buffer-backed page
 *
 * @page: the page which is affected
 * @offset: the index of the truncation point
 *
 * block_invalidatepage() is called when all or part of the page has become
 * invalidated by a truncate operation.
 *
 * block_invalidatepage() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point. Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidatepage(struct page *page, unsigned long offset)
{
    struct buffer_head *head, *bh, *next;
    unsigned int curr_off = 0;

    BUG_ON(!PageLocked(page));
    if (!page_has_buffers(page))
        goto out;

    head = page_buffers(page);
    bh = head;
    do {
        unsigned int next_off = curr_off + bh->b_size;
        next = bh->b_this_page;

        /*
         * is this block fully invalidated?
         */
        if (offset <= curr_off)
            discard_buffer(bh);
        curr_off = next_off;
        bh = next;
    } while (bh != head);

    /*
     * We release buffers only if the entire page is being invalidated.
     * The get_block cached value has been unconditionally invalidated,
     * so real IO is not possible anymore.
     */
    if (offset == 0)
        try_to_release_page(page, 0);
out:
    return;
}
EXPORT_SYMBOL(block_invalidatepage);

/*
 * We attach and possibly dirty the buffers atomically wrt
 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
 * is already excluded via the page lock.
 */
void create_empty_buffers(struct page *page,
            unsigned long blocksize, unsigned long b_state)
{
    struct buffer_head *bh, *head, *tail;

    head = alloc_page_buffers(page, blocksize, 1);
    bh = head;
    do {
        bh->b_state |= b_state;
        tail = bh;
        bh = bh->b_this_page;
    } while (bh);
    tail->b_this_page = head;

    spin_lock(&page->mapping->private_lock);
    if (PageUptodate(page) || PageDirty(page)) {
        bh = head;
        do {
            if (PageDirty(page))
                set_buffer_dirty(bh);
            if (PageUptodate(page))
                set_buffer_uptodate(bh);
            bh = bh->b_this_page;
        } while (bh != head);
    }
    attach_page_buffers(page, head);
    spin_unlock(&page->mapping->private_lock);
}
EXPORT_SYMBOL(create_empty_buffers);

/*
 * We are taking a block for data and we don't want any output from any
 * buffer-cache aliases starting from return from that function and
 * until the moment when something will explicitly mark the buffer
 * dirty (hopefully that will not happen until we free that block ;-)
 * We don't even need to mark it not-uptodate - nobody can expect
 * anything from a newly allocated buffer anyway. We used to use
 * unmap_buffer() for such invalidation, but that was wrong. We definitely
 * don't want to mark the alias unmapped, for example - it would confuse
 * anyone who might pick it with bread() afterwards...
 *
 * Also.. Note that bforget() doesn't lock the buffer. So there can
 * be writeout I/O going on against recently-freed buffers. We don't
 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
 * only if we really need to. That happens here.
 */
void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
    struct buffer_head *old_bh;

    might_sleep();

    old_bh = __find_get_block_slow(bdev, block);
    if (old_bh) {
        clear_buffer_dirty(old_bh);
        wait_on_buffer(old_bh);
        clear_buffer_req(old_bh);
        __brelse(old_bh);
    }
}
EXPORT_SYMBOL(unmap_underlying_metadata);

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *	Mapped	Uptodate	Meaning
 *
 *	No	No		"unknown" - must do get_block()
 *	No	Yes		"hole" - zero-filled
 *	Yes	No		"allocated" - allocated on disk, not read in
 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time. We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer. This only can happen if someone has written the buffer
 * directly, with submit_bh(). At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
            get_block_t *get_block, struct writeback_control *wbc,
            bh_end_io_t *handler)
{
    int err;
    sector_t block;
    sector_t last_block;
    struct buffer_head *bh, *head;
    const unsigned blocksize = 1 << inode->i_blkbits;
    int nr_underway = 0;
    int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
            WRITE_SYNC : WRITE);

    BUG_ON(!PageLocked(page));

    last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

    if (!page_has_buffers(page)) {
        create_empty_buffers(page, blocksize,
                    (1 << BH_Dirty)|(1 << BH_Uptodate));
    }

    /*
     * Be very careful. We have no exclusion from __set_page_dirty_buffers
     * here, and the (potentially unmapped) buffers may become dirty at
     * any time. If a buffer becomes dirty here after we've inspected it
     * then we just miss that fact, and the page stays dirty.
     *
     * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
     * handle that here by just cleaning them.
     */

    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
    head = page_buffers(page);
    bh = head;

    /*
     * Get all the dirty buffers mapped to disk addresses and
     * handle any aliases from the underlying blockdev's mapping.
     */
    do {
        if (block > last_block) {
            /*
             * mapped buffers outside i_size will occur, because
             * this page can be outside i_size when there is a
             * truncate in progress.
             */
            /*
             * The buffer was zeroed by block_write_full_page()
             */
            clear_buffer_dirty(bh);
            set_buffer_uptodate(bh);
        } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
               buffer_dirty(bh)) {
            WARN_ON(bh->b_size != blocksize);
            err = get_block(inode, block, bh, 1);
            if (err)
                goto recover;
            clear_buffer_delay(bh);
            if (buffer_new(bh)) {
                /* blockdev mappings never come here */
                clear_buffer_new(bh);
                unmap_underlying_metadata(bh->b_bdev,
                            bh->b_blocknr);
            }
        }
        bh = bh->b_this_page;
        block++;
    } while (bh != head);

    do {
        if (!buffer_mapped(bh))
            continue;
        /*
         * If it's a fully non-blocking write attempt and we cannot
         * lock the buffer then redirty the page. Note that this can
         * potentially cause a busy-wait loop from writeback threads
         * and kswapd activity, but those code paths have their own
         * higher-level throttling.
         */
        if (wbc->sync_mode != WB_SYNC_NONE) {
            lock_buffer(bh);
        } else if (!trylock_buffer(bh)) {
            redirty_page_for_writepage(wbc, page);
            continue;
        }
        if (test_clear_buffer_dirty(bh)) {
            mark_buffer_async_write_endio(bh, handler);
        } else {
            unlock_buffer(bh);
        }
    } while ((bh = bh->b_this_page) != head);

    /*
     * The page and its buffers are protected by PageWriteback(), so we can
     * drop the bh refcounts early.
     */
    BUG_ON(PageWriteback(page));
    set_page_writeback(page);

    do {
        struct buffer_head *next = bh->b_this_page;
        if (buffer_async_write(bh)) {
            submit_bh(write_op, bh);
            nr_underway++;
        }
        bh = next;
    } while (bh != head);
    unlock_page(page);

    err = 0;
done:
    if (nr_underway == 0) {
        /*
         * The page was marked dirty, but the buffers were
         * clean. Someone wrote them back by hand with
         * ll_rw_block/submit_bh. A rare case.
         */
        end_page_writeback(page);

        /*
         * The page and buffer_heads can be released at any time from
         * here on.
         */
    }
    return err;

recover:
    /*
     * ENOSPC, or some other error. We may already have added some
     * blocks to the file, so we need to write these out to avoid
     * exposing stale data.
     * The page is currently locked and not marked for writeback
     */
    bh = head;
    /* Recovery: lock and submit the mapped buffers */
    do {
        if (buffer_mapped(bh) && buffer_dirty(bh) &&
            !buffer_delay(bh)) {
            lock_buffer(bh);
            mark_buffer_async_write_endio(bh, handler);
        } else {
            /*
             * The buffer may have been set dirty during
             * attachment to a dirty page.
             */
            clear_buffer_dirty(bh);
        }
    } while ((bh = bh->b_this_page) != head);
    SetPageError(page);
    BUG_ON(PageWriteback(page));
    mapping_set_error(page->mapping, err);
    set_page_writeback(page);
    do {
        struct buffer_head *next = bh->b_this_page;
        if (buffer_async_write(bh)) {
            clear_buffer_dirty(bh);
            submit_bh(write_op, bh);
            nr_underway++;
        }
        bh = next;
    } while (bh != head);
    unlock_page(page);
    goto done;
}

/*
1786 * If a page has any new buffers, zero them out here, and mark them uptodate
1787 * and dirty so they'll be written out (in order to prevent uninitialised
1788 * block data from leaking). And clear the new bit.
1789 */
1790void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1791{
1792    unsigned int block_start, block_end;
1793    struct buffer_head *head, *bh;
1794
1795    BUG_ON(!PageLocked(page));
1796    if (!page_has_buffers(page))
1797        return;
1798
1799    bh = head = page_buffers(page);
1800    block_start = 0;
1801    do {
1802        block_end = block_start + bh->b_size;
1803
1804        if (buffer_new(bh)) {
1805            if (block_end > from && block_start < to) {
1806                if (!PageUptodate(page)) {
1807                    unsigned start, size;
1808
1809                    start = max(from, block_start);
1810                    size = min(to, block_end) - start;
1811
1812                    zero_user(page, start, size);
1813                    set_buffer_uptodate(bh);
1814                }
1815
1816                clear_buffer_new(bh);
1817                mark_buffer_dirty(bh);
1818            }
1819        }
1820
1821        block_start = block_end;
1822        bh = bh->b_this_page;
1823    } while (bh != head);
1824}
1825EXPORT_SYMBOL(page_zero_new_buffers);
1826
1827int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1828        get_block_t *get_block)
1829{
1830    unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1831    unsigned to = from + len;
1832    struct inode *inode = page->mapping->host;
1833    unsigned block_start, block_end;
1834    sector_t block;
1835    int err = 0;
1836    unsigned blocksize, bbits;
1837    struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1838
1839    BUG_ON(!PageLocked(page));
1840    BUG_ON(from > PAGE_CACHE_SIZE);
1841    BUG_ON(to > PAGE_CACHE_SIZE);
1842    BUG_ON(from > to);
1843
1844    blocksize = 1 << inode->i_blkbits;
1845    if (!page_has_buffers(page))
1846        create_empty_buffers(page, blocksize, 0);
1847    head = page_buffers(page);
1848
1849    bbits = inode->i_blkbits;
1850    block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1851
1852    for (bh = head, block_start = 0; bh != head || !block_start;
1853        block++, block_start = block_end, bh = bh->b_this_page) {
1854        block_end = block_start + blocksize;
1855        if (block_end <= from || block_start >= to) {
1856            if (PageUptodate(page)) {
1857                if (!buffer_uptodate(bh))
1858                    set_buffer_uptodate(bh);
1859            }
1860            continue;
1861        }
1862        if (buffer_new(bh))
1863            clear_buffer_new(bh);
1864        if (!buffer_mapped(bh)) {
1865            WARN_ON(bh->b_size != blocksize);
1866            err = get_block(inode, block, bh, 1);
1867            if (err)
1868                break;
1869            if (buffer_new(bh)) {
1870                unmap_underlying_metadata(bh->b_bdev,
1871                            bh->b_blocknr);
1872                if (PageUptodate(page)) {
1873                    clear_buffer_new(bh);
1874                    set_buffer_uptodate(bh);
1875                    mark_buffer_dirty(bh);
1876                    continue;
1877                }
1878                if (block_end > to || block_start < from)
1879                    zero_user_segments(page,
1880                        to, block_end,
1881                        block_start, from);
1882                continue;
1883            }
1884        }
1885        if (PageUptodate(page)) {
1886            if (!buffer_uptodate(bh))
1887                set_buffer_uptodate(bh);
1888            continue;
1889        }
1890        if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1891            !buffer_unwritten(bh) &&
1892             (block_start < from || block_end > to)) {
1893            ll_rw_block(READ, 1, &bh);
1894            *wait_bh++ = bh;
1895        }
1896    }
1897    /*
1898     * If we issued read requests - let them complete.
1899     */
1900    while (wait_bh > wait) {
1901        wait_on_buffer(*--wait_bh);
1902        if (!buffer_uptodate(*wait_bh))
1903            err = -EIO;
1904    }
1905    if (unlikely(err))
1906        page_zero_new_buffers(page, from, to);
1907    return err;
1908}
1909EXPORT_SYMBOL(__block_write_begin);
1910
1911static int __block_commit_write(struct inode *inode, struct page *page,
1912        unsigned from, unsigned to)
1913{
1914    unsigned block_start, block_end;
1915    int partial = 0;
1916    unsigned blocksize;
1917    struct buffer_head *bh, *head;
1918
1919    blocksize = 1 << inode->i_blkbits;
1920
1921    for (bh = head = page_buffers(page), block_start = 0;
1922        bh != head || !block_start;
1923        block_start = block_end, bh = bh->b_this_page) {
1924        block_end = block_start + blocksize;
1925        if (block_end <= from || block_start >= to) {
1926            if (!buffer_uptodate(bh))
1927                partial = 1;
1928        } else {
1929            set_buffer_uptodate(bh);
1930            mark_buffer_dirty(bh);
1931        }
1932        clear_buffer_new(bh);
1933    }
1934
1935    /*
1936     * If this is a partial write which happened to make all buffers
1937     * uptodate then we can optimize away a bogus readpage() for
1938     * the next read(). Here we 'discover' whether the page went
1939     * uptodate as a result of this (potentially partial) write.
1940     */
1941    if (!partial)
1942        SetPageUptodate(page);
1943    return 0;
1944}
1945
1946/*
1947 * block_write_begin takes care of the basic task of block allocation and
1948 * bringing partial write blocks uptodate first.
1949 *
1950 * The filesystem needs to handle block truncation upon failure.
1951 */
1952int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1953        unsigned flags, struct page **pagep, get_block_t *get_block)
1954{
1955    pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1956    struct page *page;
1957    int status;
1958
1959    page = grab_cache_page_write_begin(mapping, index, flags);
1960    if (!page)
1961        return -ENOMEM;
1962
1963    status = __block_write_begin(page, pos, len, get_block);
1964    if (unlikely(status)) {
1965        unlock_page(page);
1966        page_cache_release(page);
1967        page = NULL;
1968    }
1969
1970    *pagep = page;
1971    return status;
1972}
1973EXPORT_SYMBOL(block_write_begin);
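
/*
 * Minimal sketch, assuming a hypothetical filesystem ("examplefs") that
 * provides its own get_block callback: a typical ->write_begin is just a
 * thin wrapper around block_write_begin(). On failure a real filesystem
 * would also trim any blocks instantiated beyond i_size.
 */
int examplefs_get_block(struct inode *inode, sector_t iblock,
            struct buffer_head *bh_result, int create);

static int examplefs_write_begin(struct file *file,
            struct address_space *mapping,
            loff_t pos, unsigned len, unsigned flags,
            struct page **pagep, void **fsdata)
{
    return block_write_begin(mapping, pos, len, flags, pagep,
                 examplefs_get_block);
}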
1974
1975int block_write_end(struct file *file, struct address_space *mapping,
1976            loff_t pos, unsigned len, unsigned copied,
1977            struct page *page, void *fsdata)
1978{
1979    struct inode *inode = mapping->host;
1980    unsigned start;
1981
1982    start = pos & (PAGE_CACHE_SIZE - 1);
1983
1984    if (unlikely(copied < len)) {
1985        /*
1986         * The buffers that were written will now be uptodate, so we
1987         * don't have to worry about a readpage reading them and
1988         * overwriting a partial write. However if we have encountered
1989         * a short write and only partially written into a buffer, it
1990         * will not be marked uptodate, so a readpage might come in and
1991         * destroy our partial write.
1992         *
1993         * Do the simplest thing, and just treat any short write to a
1994         * non uptodate page as a zero-length write, and force the
1995         * caller to redo the whole thing.
1996         */
1997        if (!PageUptodate(page))
1998            copied = 0;
1999
2000        page_zero_new_buffers(page, start+copied, start+len);
2001    }
2002    flush_dcache_page(page);
2003
2004    /* This could be a short (even 0-length) commit */
2005    __block_commit_write(inode, page, start, start+copied);
2006
2007    return copied;
2008}
2009EXPORT_SYMBOL(block_write_end);
2010
2011int generic_write_end(struct file *file, struct address_space *mapping,
2012            loff_t pos, unsigned len, unsigned copied,
2013            struct page *page, void *fsdata)
2014{
2015    struct inode *inode = mapping->host;
2016    int i_size_changed = 0;
2017
2018    copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2019
2020    /*
2021     * No need to use i_size_read() here, the i_size
2022     * cannot change under us because we hold i_mutex.
2023     *
2024     * But it's important to update i_size while still holding page lock:
2025     * page writeout could otherwise come in and zero beyond i_size.
2026     */
2027    if (pos+copied > inode->i_size) {
2028        i_size_write(inode, pos+copied);
2029        i_size_changed = 1;
2030    }
2031
2032    unlock_page(page);
2033    page_cache_release(page);
2034
2035    /*
2036     * Don't mark the inode dirty under page lock. First, it unnecessarily
2037     * makes the holding time of page lock longer. Second, it forces lock
2038     * ordering of page lock and transaction start for journaling
2039     * filesystems.
2040     */
2041    if (i_size_changed)
2042        mark_inode_dirty(inode);
2043
2044    return copied;
2045}
2046EXPORT_SYMBOL(generic_write_end);
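
/*
 * Sketch (hypothetical examplefs, continued): a filesystem with no
 * post-commit work points ->write_end straight at generic_write_end();
 * one that has some wraps it like this.
 */
static int examplefs_write_end(struct file *file,
            struct address_space *mapping,
            loff_t pos, unsigned len, unsigned copied,
            struct page *page, void *fsdata)
{
    int ret = generic_write_end(file, mapping, pos, len, copied,
                    page, fsdata);
    /* a real filesystem might update its private in-core state here */
    return ret;
}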
2047
2048/*
2049 * block_is_partially_uptodate checks whether buffers within a page are
2050 * uptodate or not.
2051 *
2052 * Returns true if all buffers which correspond to a file portion
2053 * we want to read are uptodate.
2054 */
2055int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2056                    unsigned long from)
2057{
2058    struct inode *inode = page->mapping->host;
2059    unsigned block_start, block_end, blocksize;
2060    unsigned to;
2061    struct buffer_head *bh, *head;
2062    int ret = 1;
2063
2064    if (!page_has_buffers(page))
2065        return 0;
2066
2067    blocksize = 1 << inode->i_blkbits;
2068    to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2069    to = from + to;
2070    if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2071        return 0;
2072
2073    head = page_buffers(page);
2074    bh = head;
2075    block_start = 0;
2076    do {
2077        block_end = block_start + blocksize;
2078        if (block_end > from && block_start < to) {
2079            if (!buffer_uptodate(bh)) {
2080                ret = 0;
2081                break;
2082            }
2083            if (block_end >= to)
2084                break;
2085        }
2086        block_start = block_end;
2087        bh = bh->b_this_page;
2088    } while (bh != head);
2089
2090    return ret;
2091}
2092EXPORT_SYMBOL(block_is_partially_uptodate);
2093
2094/*
2095 * Generic "read page" function for block devices that have the normal
2096 * get_block functionality. This is most of the block device filesystems.
2097 * Reads the page asynchronously --- the unlock_buffer() and
2098 * set/clear_buffer_uptodate() functions propagate buffer state into the
2099 * page struct once IO has completed.
2100 */
2101int block_read_full_page(struct page *page, get_block_t *get_block)
2102{
2103    struct inode *inode = page->mapping->host;
2104    sector_t iblock, lblock;
2105    struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2106    unsigned int blocksize;
2107    int nr, i;
2108    int fully_mapped = 1;
2109
2110    BUG_ON(!PageLocked(page));
2111    blocksize = 1 << inode->i_blkbits;
2112    if (!page_has_buffers(page))
2113        create_empty_buffers(page, blocksize, 0);
2114    head = page_buffers(page);
2115
2116    iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2117    lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2118    bh = head;
2119    nr = 0;
2120    i = 0;
2121
2122    do {
2123        if (buffer_uptodate(bh))
2124            continue;
2125
2126        if (!buffer_mapped(bh)) {
2127            int err = 0;
2128
2129            fully_mapped = 0;
2130            if (iblock < lblock) {
2131                WARN_ON(bh->b_size != blocksize);
2132                err = get_block(inode, iblock, bh, 0);
2133                if (err)
2134                    SetPageError(page);
2135            }
2136            if (!buffer_mapped(bh)) {
2137                zero_user(page, i * blocksize, blocksize);
2138                if (!err)
2139                    set_buffer_uptodate(bh);
2140                continue;
2141            }
2142            /*
2143             * get_block() might have updated the buffer
2144             * synchronously
2145             */
2146            if (buffer_uptodate(bh))
2147                continue;
2148        }
2149        arr[nr++] = bh;
2150    } while (i++, iblock++, (bh = bh->b_this_page) != head);
2151
2152    if (fully_mapped)
2153        SetPageMappedToDisk(page);
2154
2155    if (!nr) {
2156        /*
2157         * All buffers are uptodate - we can set the page uptodate
2158         * as well. But not if get_block() returned an error.
2159         */
2160        if (!PageError(page))
2161            SetPageUptodate(page);
2162        unlock_page(page);
2163        return 0;
2164    }
2165
2166    /* Stage two: lock the buffers */
2167    for (i = 0; i < nr; i++) {
2168        bh = arr[i];
2169        lock_buffer(bh);
2170        mark_buffer_async_read(bh);
2171    }
2172
2173    /*
2174     * Stage 3: start the IO. Check for uptodateness
2175     * inside the buffer lock in case another process reading
2176     * the underlying blockdev brought it uptodate (the sct fix).
2177     */
2178    for (i = 0; i < nr; i++) {
2179        bh = arr[i];
2180        if (buffer_uptodate(bh))
2181            end_buffer_async_read(bh, 1);
2182        else
2183            submit_bh(READ, bh);
2184    }
2185    return 0;
2186}
2187EXPORT_SYMBOL(block_read_full_page);
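
/*
 * Sketch (hypothetical examplefs): ->readpage is normally a thin wrapper
 * that passes the filesystem's mapping callback to block_read_full_page().
 */
static int examplefs_readpage(struct file *file, struct page *page)
{
    return block_read_full_page(page, examplefs_get_block);
}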
2188
2189/* utility function for filesystems that need to do work on expanding
2190 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2191 * deal with the hole.
2192 */
2193int generic_cont_expand_simple(struct inode *inode, loff_t size)
2194{
2195    struct address_space *mapping = inode->i_mapping;
2196    struct page *page;
2197    void *fsdata;
2198    int err;
2199
2200    err = inode_newsize_ok(inode, size);
2201    if (err)
2202        goto out;
2203
2204    err = pagecache_write_begin(NULL, mapping, size, 0,
2205                AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2206                &page, &fsdata);
2207    if (err)
2208        goto out;
2209
2210    err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2211    BUG_ON(err > 0);
2212
2213out:
2214    return err;
2215}
2216EXPORT_SYMBOL(generic_cont_expand_simple);
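
/*
 * Sketch (hypothetical examplefs): an expanding truncate in ->setattr
 * could use the helper like this before dirtying the inode.
 */
static int examplefs_expand(struct inode *inode, loff_t newsize)
{
    int err = generic_cont_expand_simple(inode, newsize);

    if (!err)
        mark_inode_dirty(inode);
    return err;
}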
2217
2218static int cont_expand_zero(struct file *file, struct address_space *mapping,
2219                loff_t pos, loff_t *bytes)
2220{
2221    struct inode *inode = mapping->host;
2222    unsigned blocksize = 1 << inode->i_blkbits;
2223    struct page *page;
2224    void *fsdata;
2225    pgoff_t index, curidx;
2226    loff_t curpos;
2227    unsigned zerofrom, offset, len;
2228    int err = 0;
2229
2230    index = pos >> PAGE_CACHE_SHIFT;
2231    offset = pos & ~PAGE_CACHE_MASK;
2232
2233    while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2234        zerofrom = curpos & ~PAGE_CACHE_MASK;
2235        if (zerofrom & (blocksize-1)) {
2236            *bytes |= (blocksize-1);
2237            (*bytes)++;
2238        }
2239        len = PAGE_CACHE_SIZE - zerofrom;
2240
2241        err = pagecache_write_begin(file, mapping, curpos, len,
2242                        AOP_FLAG_UNINTERRUPTIBLE,
2243                        &page, &fsdata);
2244        if (err)
2245            goto out;
2246        zero_user(page, zerofrom, len);
2247        err = pagecache_write_end(file, mapping, curpos, len, len,
2248                        page, fsdata);
2249        if (err < 0)
2250            goto out;
2251        BUG_ON(err != len);
2252        err = 0;
2253
2254        balance_dirty_pages_ratelimited(mapping);
2255    }
2256
2257    /* page covers the boundary, find the boundary offset */
2258    if (index == curidx) {
2259        zerofrom = curpos & ~PAGE_CACHE_MASK;
2260        /* if we are expanding the file, the last block will be filled */
2261        if (offset <= zerofrom) {
2262            goto out;
2263        }
2264        if (zerofrom & (blocksize-1)) {
2265            *bytes |= (blocksize-1);
2266            (*bytes)++;
2267        }
2268        len = offset - zerofrom;
2269
2270        err = pagecache_write_begin(file, mapping, curpos, len,
2271                        AOP_FLAG_UNINTERRUPTIBLE,
2272                        &page, &fsdata);
2273        if (err)
2274            goto out;
2275        zero_user(page, zerofrom, len);
2276        err = pagecache_write_end(file, mapping, curpos, len, len,
2277                        page, fsdata);
2278        if (err < 0)
2279            goto out;
2280        BUG_ON(err != len);
2281        err = 0;
2282    }
2283out:
2284    return err;
2285}
2286
2287/*
2288 * For moronic filesystems that do not allow holes in files.
2289 * We may have to extend the file.
2290 */
2291int cont_write_begin(struct file *file, struct address_space *mapping,
2292            loff_t pos, unsigned len, unsigned flags,
2293            struct page **pagep, void **fsdata,
2294            get_block_t *get_block, loff_t *bytes)
2295{
2296    struct inode *inode = mapping->host;
2297    unsigned blocksize = 1 << inode->i_blkbits;
2298    unsigned zerofrom;
2299    int err;
2300
2301    err = cont_expand_zero(file, mapping, pos, bytes);
2302    if (err)
2303        return err;
2304
2305    zerofrom = *bytes & ~PAGE_CACHE_MASK;
2306    if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2307        *bytes |= (blocksize-1);
2308        (*bytes)++;
2309    }
2310
2311    return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2312}
2313EXPORT_SYMBOL(cont_write_begin);
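
/*
 * Sketch, assuming the hypothetical examplefs tracks the byte count up to
 * which blocks are known to be allocated (fat keeps this in mmu_private).
 * A real filesystem would store it per-inode; a file-scope variable is
 * used here only to keep the sketch self-contained.
 */
static loff_t examplefs_allocated_bytes;

static int examplefs_cont_write_begin(struct file *file,
            struct address_space *mapping,
            loff_t pos, unsigned len, unsigned flags,
            struct page **pagep, void **fsdata)
{
    return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                examplefs_get_block, &examplefs_allocated_bytes);
}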
2314
2315int block_commit_write(struct page *page, unsigned from, unsigned to)
2316{
2317    struct inode *inode = page->mapping->host;
2318    __block_commit_write(inode, page, from, to);
2319    return 0;
2320}
2321EXPORT_SYMBOL(block_commit_write);
2322
2323/*
2324 * block_page_mkwrite() is not allowed to change the file size as it gets
2325 * called from a page fault handler when a page is first dirtied. Hence we must
2326 * be careful to check for EOF conditions here. We set the page up correctly
2327 * for a written page which means we get ENOSPC checking when writing into
2328 * holes and correct delalloc and unwritten extent mapping on filesystems that
2329 * support these features.
2330 *
2331 * We are not allowed to take the i_mutex here so we have to play games to
2332 * protect against truncate races as the page could now be beyond EOF. Because
2333 * truncate writes the inode size before removing pages, once we have the
2334 * page lock we can determine safely if the page is beyond EOF. If it is not
2335 * beyond EOF, then the page is guaranteed safe against truncation until we
2336 * unlock the page.
2337 *
2338 * Direct callers of this function should call vfs_check_frozen() so that page
2339 * fault does not busyloop until the fs is thawed.
2340 */
2341int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2342             get_block_t get_block)
2343{
2344    struct page *page = vmf->page;
2345    struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2346    unsigned long end;
2347    loff_t size;
2348    int ret;
2349
2350    lock_page(page);
2351    size = i_size_read(inode);
2352    if ((page->mapping != inode->i_mapping) ||
2353        (page_offset(page) > size)) {
2354        /* We overload EFAULT to mean page got truncated */
2355        ret = -EFAULT;
2356        goto out_unlock;
2357    }
2358
2359    /* page is wholly or partially inside EOF */
2360    if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2361        end = size & ~PAGE_CACHE_MASK;
2362    else
2363        end = PAGE_CACHE_SIZE;
2364
2365    ret = __block_write_begin(page, 0, end, get_block);
2366    if (!ret)
2367        ret = block_commit_write(page, 0, end);
2368
2369    if (unlikely(ret < 0))
2370        goto out_unlock;
2371    /*
2372     * Freezing in progress? We check after the page is marked dirty and
2373     * with page lock held so if the test here fails, we are sure freezing
2374     * code will wait during syncing until the page fault is done - at that
2375     * point page will be dirty and unlocked so freezing code will write it
2376     * and writeprotect it again.
2377     */
2378    set_page_dirty(page);
2379    if (inode->i_sb->s_frozen != SB_UNFROZEN) {
2380        ret = -EAGAIN;
2381        goto out_unlock;
2382    }
2383    wait_on_page_writeback(page);
2384    return 0;
2385out_unlock:
2386    unlock_page(page);
2387    return ret;
2388}
2389EXPORT_SYMBOL(__block_page_mkwrite);
2390
2391int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2392           get_block_t get_block)
2393{
2394    int ret;
2395    struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
2396
2397    /*
2398     * This check is racy but catches the common case. The check in
2399     * __block_page_mkwrite() is reliable.
2400     */
2401    vfs_check_frozen(sb, SB_FREEZE_WRITE);
2402    ret = __block_page_mkwrite(vma, vmf, get_block);
2403    return block_page_mkwrite_return(ret);
2404}
2405EXPORT_SYMBOL(block_page_mkwrite);
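
/*
 * Sketch (hypothetical examplefs): wiring the helper into the fault path.
 */
static int examplefs_page_mkwrite(struct vm_area_struct *vma,
                  struct vm_fault *vmf)
{
    return block_page_mkwrite(vma, vmf, examplefs_get_block);
}

static const struct vm_operations_struct examplefs_file_vm_ops = {
    .fault      = filemap_fault,
    .page_mkwrite   = examplefs_page_mkwrite,
};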
2406
2407/*
2408 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2409 * immediately, while under the page lock. So it needs a special end_io
2410 * handler which does not touch the bh after unlocking it.
2411 */
2412static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2413{
2414    __end_buffer_read_notouch(bh, uptodate);
2415}
2416
2417/*
2418 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2419 * the page (converting it to circular linked list and taking care of page
2420 * dirty races).
2421 */
2422static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2423{
2424    struct buffer_head *bh;
2425
2426    BUG_ON(!PageLocked(page));
2427
2428    spin_lock(&page->mapping->private_lock);
2429    bh = head;
2430    do {
2431        if (PageDirty(page))
2432            set_buffer_dirty(bh);
2433        if (!bh->b_this_page)
2434            bh->b_this_page = head;
2435        bh = bh->b_this_page;
2436    } while (bh != head);
2437    attach_page_buffers(page, head);
2438    spin_unlock(&page->mapping->private_lock);
2439}
2440
2441/*
2442 * On entry, the page is entirely not uptodate.
2443 * On exit, the page is fully uptodate in the areas outside (from,to).
2444 * The filesystem needs to handle block truncation upon failure.
2445 */
2446int nobh_write_begin(struct address_space *mapping,
2447            loff_t pos, unsigned len, unsigned flags,
2448            struct page **pagep, void **fsdata,
2449            get_block_t *get_block)
2450{
2451    struct inode *inode = mapping->host;
2452    const unsigned blkbits = inode->i_blkbits;
2453    const unsigned blocksize = 1 << blkbits;
2454    struct buffer_head *head, *bh;
2455    struct page *page;
2456    pgoff_t index;
2457    unsigned from, to;
2458    unsigned block_in_page;
2459    unsigned block_start, block_end;
2460    sector_t block_in_file;
2461    int nr_reads = 0;
2462    int ret = 0;
2463    int is_mapped_to_disk = 1;
2464
2465    index = pos >> PAGE_CACHE_SHIFT;
2466    from = pos & (PAGE_CACHE_SIZE - 1);
2467    to = from + len;
2468
2469    page = grab_cache_page_write_begin(mapping, index, flags);
2470    if (!page)
2471        return -ENOMEM;
2472    *pagep = page;
2473    *fsdata = NULL;
2474
2475    if (page_has_buffers(page)) {
2476        ret = __block_write_begin(page, pos, len, get_block);
2477        if (unlikely(ret))
2478            goto out_release;
2479        return ret;
2480    }
2481
2482    if (PageMappedToDisk(page))
2483        return 0;
2484
2485    /*
2486     * Allocate buffers so that we can keep track of state, and potentially
2487     * attach them to the page if an error occurs. In the common case of
2488     * no error, they will just be freed again without ever being attached
2489     * to the page (which is all OK, because we're under the page lock).
2490     *
2491     * Be careful: the buffer linked list is a NULL terminated one, rather
2492     * than the circular one we're used to.
2493     */
2494    head = alloc_page_buffers(page, blocksize, 0);
2495    if (!head) {
2496        ret = -ENOMEM;
2497        goto out_release;
2498    }
2499
2500    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2501
2502    /*
2503     * We loop across all blocks in the page, whether or not they are
2504     * part of the affected region. This is so we can discover if the
2505     * page is fully mapped-to-disk.
2506     */
2507    for (block_start = 0, block_in_page = 0, bh = head;
2508          block_start < PAGE_CACHE_SIZE;
2509          block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2510        int create;
2511
2512        block_end = block_start + blocksize;
2513        bh->b_state = 0;
2514        create = 1;
2515        if (block_start >= to)
2516            create = 0;
2517        ret = get_block(inode, block_in_file + block_in_page,
2518                    bh, create);
2519        if (ret)
2520            goto failed;
2521        if (!buffer_mapped(bh))
2522            is_mapped_to_disk = 0;
2523        if (buffer_new(bh))
2524            unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2525        if (PageUptodate(page)) {
2526            set_buffer_uptodate(bh);
2527            continue;
2528        }
2529        if (buffer_new(bh) || !buffer_mapped(bh)) {
2530            zero_user_segments(page, block_start, from,
2531                            to, block_end);
2532            continue;
2533        }
2534        if (buffer_uptodate(bh))
2535            continue; /* reiserfs does this */
2536        if (block_start < from || block_end > to) {
2537            lock_buffer(bh);
2538            bh->b_end_io = end_buffer_read_nobh;
2539            submit_bh(READ, bh);
2540            nr_reads++;
2541        }
2542    }
2543
2544    if (nr_reads) {
2545        /*
2546         * The page is locked, so these buffers are protected from
2547         * any VM or truncate activity. Hence we don't need to care
2548         * for the buffer_head refcounts.
2549         */
2550        for (bh = head; bh; bh = bh->b_this_page) {
2551            wait_on_buffer(bh);
2552            if (!buffer_uptodate(bh))
2553                ret = -EIO;
2554        }
2555        if (ret)
2556            goto failed;
2557    }
2558
2559    if (is_mapped_to_disk)
2560        SetPageMappedToDisk(page);
2561
2562    *fsdata = head; /* to be released by nobh_write_end */
2563
2564    return 0;
2565
2566failed:
2567    BUG_ON(!ret);
2568    /*
2569     * Error recovery is a bit difficult. We need to zero out blocks that
2570     * were newly allocated, and dirty them to ensure they get written out.
2571     * Buffers need to be attached to the page at this point, otherwise
2572     * the handling of potential IO errors during writeout would be hard
2573     * (could try doing synchronous writeout, but what if that fails too?)
2574     */
2575    attach_nobh_buffers(page, head);
2576    page_zero_new_buffers(page, from, to);
2577
2578out_release:
2579    unlock_page(page);
2580    page_cache_release(page);
2581    *pagep = NULL;
2582
2583    return ret;
2584}
2585EXPORT_SYMBOL(nobh_write_begin);
2586
2587int nobh_write_end(struct file *file, struct address_space *mapping,
2588            loff_t pos, unsigned len, unsigned copied,
2589            struct page *page, void *fsdata)
2590{
2591    struct inode *inode = page->mapping->host;
2592    struct buffer_head *head = fsdata;
2593    struct buffer_head *bh;
2594    BUG_ON(fsdata != NULL && page_has_buffers(page));
2595
2596    if (unlikely(copied < len) && head)
2597        attach_nobh_buffers(page, head);
2598    if (page_has_buffers(page))
2599        return generic_write_end(file, mapping, pos, len,
2600                    copied, page, fsdata);
2601
2602    SetPageUptodate(page);
2603    set_page_dirty(page);
2604    if (pos+copied > inode->i_size) {
2605        i_size_write(inode, pos+copied);
2606        mark_inode_dirty(inode);
2607    }
2608
2609    unlock_page(page);
2610    page_cache_release(page);
2611
2612    while (head) {
2613        bh = head;
2614        head = head->b_this_page;
2615        free_buffer_head(bh);
2616    }
2617
2618    return copied;
2619}
2620EXPORT_SYMBOL(nobh_write_end);
2621
2622/*
2623 * nobh_writepage() - based on block_write_full_page() except
2624 * that it tries to operate without attaching bufferheads to
2625 * the page.
2626 */
2627int nobh_writepage(struct page *page, get_block_t *get_block,
2628            struct writeback_control *wbc)
2629{
2630    struct inode * const inode = page->mapping->host;
2631    loff_t i_size = i_size_read(inode);
2632    const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2633    unsigned offset;
2634    int ret;
2635
2636    /* Is the page fully inside i_size? */
2637    if (page->index < end_index)
2638        goto out;
2639
2640    /* Is the page fully outside i_size? (truncate in progress) */
2641    offset = i_size & (PAGE_CACHE_SIZE-1);
2642    if (page->index >= end_index+1 || !offset) {
2643        /*
2644         * The page may have dirty, unmapped buffers. For example,
2645         * they may have been added in ext3_writepage(). Make them
2646         * freeable here, so the page does not leak.
2647         */
2648#if 0
2649        /* Not really sure about this - do we need this ? */
2650        if (page->mapping->a_ops->invalidatepage)
2651            page->mapping->a_ops->invalidatepage(page, offset);
2652#endif
2653        unlock_page(page);
2654        return 0; /* don't care */
2655    }
2656
2657    /*
2658     * The page straddles i_size. It must be zeroed out on each and every
2659     * writepage invocation because it may be mmapped. "A file is mapped
2660     * in multiples of the page size. For a file that is not a multiple of
2661     * the page size, the remaining memory is zeroed when mapped, and
2662     * writes to that region are not written out to the file."
2663     */
2664    zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2665out:
2666    ret = mpage_writepage(page, get_block, wbc);
2667    if (ret == -EAGAIN)
2668        ret = __block_write_full_page(inode, page, get_block, wbc,
2669                          end_buffer_async_write);
2670    return ret;
2671}
2672EXPORT_SYMBOL(nobh_writepage);
2673
2674int nobh_truncate_page(struct address_space *mapping,
2675            loff_t from, get_block_t *get_block)
2676{
2677    pgoff_t index = from >> PAGE_CACHE_SHIFT;
2678    unsigned offset = from & (PAGE_CACHE_SIZE-1);
2679    unsigned blocksize;
2680    sector_t iblock;
2681    unsigned length, pos;
2682    struct inode *inode = mapping->host;
2683    struct page *page;
2684    struct buffer_head map_bh;
2685    int err;
2686
2687    blocksize = 1 << inode->i_blkbits;
2688    length = offset & (blocksize - 1);
2689
2690    /* Block boundary? Nothing to do */
2691    if (!length)
2692        return 0;
2693
2694    length = blocksize - length;
2695    iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2696
2697    page = grab_cache_page(mapping, index);
2698    err = -ENOMEM;
2699    if (!page)
2700        goto out;
2701
2702    if (page_has_buffers(page)) {
2703has_buffers:
2704        unlock_page(page);
2705        page_cache_release(page);
2706        return block_truncate_page(mapping, from, get_block);
2707    }
2708
2709    /* Find the buffer that contains "offset" */
2710    pos = blocksize;
2711    while (offset >= pos) {
2712        iblock++;
2713        pos += blocksize;
2714    }
2715
2716    map_bh.b_size = blocksize;
2717    map_bh.b_state = 0;
2718    err = get_block(inode, iblock, &map_bh, 0);
2719    if (err)
2720        goto unlock;
2721    /* unmapped? It's a hole - nothing to do */
2722    if (!buffer_mapped(&map_bh))
2723        goto unlock;
2724
2725    /* Ok, it's mapped. Make sure it's up-to-date */
2726    if (!PageUptodate(page)) {
2727        err = mapping->a_ops->readpage(NULL, page);
2728        if (err) {
2729            page_cache_release(page);
2730            goto out;
2731        }
2732        lock_page(page);
2733        if (!PageUptodate(page)) {
2734            err = -EIO;
2735            goto unlock;
2736        }
2737        if (page_has_buffers(page))
2738            goto has_buffers;
2739    }
2740    zero_user(page, offset, length);
2741    set_page_dirty(page);
2742    err = 0;
2743
2744unlock:
2745    unlock_page(page);
2746    page_cache_release(page);
2747out:
2748    return err;
2749}
2750EXPORT_SYMBOL(nobh_truncate_page);
2751
2752int block_truncate_page(struct address_space *mapping,
2753            loff_t from, get_block_t *get_block)
2754{
2755    pgoff_t index = from >> PAGE_CACHE_SHIFT;
2756    unsigned offset = from & (PAGE_CACHE_SIZE-1);
2757    unsigned blocksize;
2758    sector_t iblock;
2759    unsigned length, pos;
2760    struct inode *inode = mapping->host;
2761    struct page *page;
2762    struct buffer_head *bh;
2763    int err;
2764
2765    blocksize = 1 << inode->i_blkbits;
2766    length = offset & (blocksize - 1);
2767
2768    /* Block boundary? Nothing to do */
2769    if (!length)
2770        return 0;
2771
2772    length = blocksize - length;
2773    iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2774
2775    page = grab_cache_page(mapping, index);
2776    err = -ENOMEM;
2777    if (!page)
2778        goto out;
2779
2780    if (!page_has_buffers(page))
2781        create_empty_buffers(page, blocksize, 0);
2782
2783    /* Find the buffer that contains "offset" */
2784    bh = page_buffers(page);
2785    pos = blocksize;
2786    while (offset >= pos) {
2787        bh = bh->b_this_page;
2788        iblock++;
2789        pos += blocksize;
2790    }
2791
2792    err = 0;
2793    if (!buffer_mapped(bh)) {
2794        WARN_ON(bh->b_size != blocksize);
2795        err = get_block(inode, iblock, bh, 0);
2796        if (err)
2797            goto unlock;
2798        /* unmapped? It's a hole - nothing to do */
2799        if (!buffer_mapped(bh))
2800            goto unlock;
2801    }
2802
2803    /* Ok, it's mapped. Make sure it's up-to-date */
2804    if (PageUptodate(page))
2805        set_buffer_uptodate(bh);
2806
2807    if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2808        err = -EIO;
2809        ll_rw_block(READ, 1, &bh);
2810        wait_on_buffer(bh);
2811        /* Uhhuh. Read error. Complain and punt. */
2812        if (!buffer_uptodate(bh))
2813            goto unlock;
2814    }
2815
2816    zero_user(page, offset, length);
2817    mark_buffer_dirty(bh);
2818    err = 0;
2819
2820unlock:
2821    unlock_page(page);
2822    page_cache_release(page);
2823out:
2824    return err;
2825}
2826EXPORT_SYMBOL(block_truncate_page);
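
/*
 * Sketch (hypothetical examplefs): a shrinking truncate zeroes the
 * partial final block first, then commits the new size.
 */
static int examplefs_truncate(struct inode *inode, loff_t newsize)
{
    int err = block_truncate_page(inode->i_mapping, newsize,
                      examplefs_get_block);
    if (err)
        return err;
    truncate_setsize(inode, newsize);
    /* a real filesystem would free the now-unreferenced blocks here */
    return 0;
}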
2827
2828/*
2829 * The generic ->writepage function for buffer-backed address_spaces
2830 * this form passes in the end_io handler used to finish the IO.
2831 */
2832int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2833            struct writeback_control *wbc, bh_end_io_t *handler)
2834{
2835    struct inode * const inode = page->mapping->host;
2836    loff_t i_size = i_size_read(inode);
2837    const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2838    unsigned offset;
2839
2840    /* Is the page fully inside i_size? */
2841    if (page->index < end_index)
2842        return __block_write_full_page(inode, page, get_block, wbc,
2843                           handler);
2844
2845    /* Is the page fully outside i_size? (truncate in progress) */
2846    offset = i_size & (PAGE_CACHE_SIZE-1);
2847    if (page->index >= end_index+1 || !offset) {
2848        /*
2849         * The page may have dirty, unmapped buffers. For example,
2850         * they may have been added in ext3_writepage(). Make them
2851         * freeable here, so the page does not leak.
2852         */
2853        do_invalidatepage(page, 0);
2854        unlock_page(page);
2855        return 0; /* don't care */
2856    }
2857
2858    /*
2859     * The page straddles i_size. It must be zeroed out on each and every
2860     * writepage invocation because it may be mmapped. "A file is mapped
2861     * in multiples of the page size. For a file that is not a multiple of
2862     * the page size, the remaining memory is zeroed when mapped, and
2863     * writes to that region are not written out to the file."
2864     */
2865    zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2866    return __block_write_full_page(inode, page, get_block, wbc, handler);
2867}
2868EXPORT_SYMBOL(block_write_full_page_endio);
2869
2870/*
2871 * The generic ->writepage function for buffer-backed address_spaces
2872 */
2873int block_write_full_page(struct page *page, get_block_t *get_block,
2874            struct writeback_control *wbc)
2875{
2876    return block_write_full_page_endio(page, get_block, wbc,
2877                       end_buffer_async_write);
2878}
2879EXPORT_SYMBOL(block_write_full_page);
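
/*
 * Sketch (hypothetical examplefs): the usual thin ->writepage wrapper.
 */
static int examplefs_writepage(struct page *page,
                   struct writeback_control *wbc)
{
    return block_write_full_page(page, examplefs_get_block, wbc);
}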
2880
2881sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2882                get_block_t *get_block)
2883{
2884    struct buffer_head tmp;
2885    struct inode *inode = mapping->host;
2886    tmp.b_state = 0;
2887    tmp.b_blocknr = 0;
2888    tmp.b_size = 1 << inode->i_blkbits;
2889    get_block(inode, block, &tmp, 0);
2890    return tmp.b_blocknr;
2891}
2892EXPORT_SYMBOL(generic_block_bmap);
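
/*
 * Sketch (hypothetical examplefs): ->bmap wrapper, and how the wrappers
 * sketched above would be collected into the address_space operations.
 */
static sector_t examplefs_bmap(struct address_space *mapping, sector_t block)
{
    return generic_block_bmap(mapping, block, examplefs_get_block);
}

static const struct address_space_operations examplefs_aops = {
    .readpage       = examplefs_readpage,
    .writepage      = examplefs_writepage,
    .write_begin        = examplefs_write_begin,
    .write_end      = examplefs_write_end,
    .bmap           = examplefs_bmap,
    .is_partially_uptodate  = block_is_partially_uptodate,
};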
2893
2894static void end_bio_bh_io_sync(struct bio *bio, int err)
2895{
2896    struct buffer_head *bh = bio->bi_private;
2897
2898    if (err == -EOPNOTSUPP) {
2899        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2900    }
2901
2902    if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
2903        set_bit(BH_Quiet, &bh->b_state);
2904
2905    bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2906    bio_put(bio);
2907}
2908
2909int submit_bh(int rw, struct buffer_head * bh)
2910{
2911    struct bio *bio;
2912    int ret = 0;
2913
2914    BUG_ON(!buffer_locked(bh));
2915    BUG_ON(!buffer_mapped(bh));
2916    BUG_ON(!bh->b_end_io);
2917    BUG_ON(buffer_delay(bh));
2918    BUG_ON(buffer_unwritten(bh));
2919
2920    /*
2921     * Only clear out a write error when rewriting
2922     */
2923    if (test_set_buffer_req(bh) && (rw & WRITE))
2924        clear_buffer_write_io_error(bh);
2925
2926    /*
2927     * from here on down, it's all bio -- do the initial mapping,
2928     * submit_bio -> generic_make_request may further map this bio around
2929     */
2930    bio = bio_alloc(GFP_NOIO, 1);
2931
2932    bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2933    bio->bi_bdev = bh->b_bdev;
2934    bio->bi_io_vec[0].bv_page = bh->b_page;
2935    bio->bi_io_vec[0].bv_len = bh->b_size;
2936    bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2937
2938    bio->bi_vcnt = 1;
2939    bio->bi_idx = 0;
2940    bio->bi_size = bh->b_size;
2941
2942    bio->bi_end_io = end_bio_bh_io_sync;
2943    bio->bi_private = bh;
2944
2945    bio_get(bio);
2946    submit_bio(rw, bio);
2947
2948    if (bio_flagged(bio, BIO_EOPNOTSUPP))
2949        ret = -EOPNOTSUPP;
2950
2951    bio_put(bio);
2952    return ret;
2953}
2954EXPORT_SYMBOL(submit_bh);
2955
2956/**
2957 * ll_rw_block: low-level access to block devices (DEPRECATED)
2958 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2959 * @nr: number of &struct buffer_heads in the array
2960 * @bhs: array of pointers to &struct buffer_head
2961 *
2962 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2963 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2964 * %READA option is described in the documentation for generic_make_request()
2965 * which ll_rw_block() calls.
2966 *
2967 * This function drops any buffer that it cannot get a lock on (with the
2968 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2969 * request, and any buffer that appears to be up-to-date when doing a read
2970 * request. Further, it marks as clean the buffers that are processed for
2971 * writing (the buffer cache won't assume that they are actually clean
2972 * until the buffer gets unlocked).
2973 *
2974 * ll_rw_block sets b_end_io to a simple completion handler that marks
2975 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2976 * any waiters.
2977 *
2978 * All of the buffers must be for the same device, and must also be a
2979 * multiple of the current approved size for the device.
2980 */
2981void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2982{
2983    int i;
2984
2985    for (i = 0; i < nr; i++) {
2986        struct buffer_head *bh = bhs[i];
2987
2988        if (!trylock_buffer(bh))
2989            continue;
2990        if (rw == WRITE) {
2991            if (test_clear_buffer_dirty(bh)) {
2992                bh->b_end_io = end_buffer_write_sync;
2993                get_bh(bh);
2994                submit_bh(WRITE, bh);
2995                continue;
2996            }
2997        } else {
2998            if (!buffer_uptodate(bh)) {
2999                bh->b_end_io = end_buffer_read_sync;
3000                get_bh(bh);
3001                submit_bh(rw, bh);
3002                continue;
3003            }
3004        }
3005        unlock_buffer(bh);
3006    }
3007}
3008EXPORT_SYMBOL(ll_rw_block);
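
/*
 * Sketch (illustrative): opportunistic readahead of consecutive metadata
 * blocks, the same pattern __breadahead() uses for a single block.
 */
static void example_breadahead(struct super_block *sb, sector_t block, int nr)
{
    int i;

    for (i = 0; i < nr; i++) {
        struct buffer_head *bh = sb_getblk(sb, block + i);

        if (!bh)
            continue;
        ll_rw_block(READA, 1, &bh);
        brelse(bh); /* drop our ref; the IO, if any, holds its own */
    }
}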
3009
3010void write_dirty_buffer(struct buffer_head *bh, int rw)
3011{
3012    lock_buffer(bh);
3013    if (!test_clear_buffer_dirty(bh)) {
3014        unlock_buffer(bh);
3015        return;
3016    }
3017    bh->b_end_io = end_buffer_write_sync;
3018    get_bh(bh);
3019    submit_bh(rw, bh);
3020}
3021EXPORT_SYMBOL(write_dirty_buffer);
3022
3023/*
3024 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3025 * and then start new I/O and then wait upon it. The caller must have a ref on
3026 * the buffer_head.
3027 */
3028int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3029{
3030    int ret = 0;
3031
3032    WARN_ON(atomic_read(&bh->b_count) < 1);
3033    lock_buffer(bh);
3034    if (test_clear_buffer_dirty(bh)) {
3035        get_bh(bh);
3036        bh->b_end_io = end_buffer_write_sync;
3037        ret = submit_bh(rw, bh);
3038        wait_on_buffer(bh);
3039        if (!ret && !buffer_uptodate(bh))
3040            ret = -EIO;
3041    } else {
3042        unlock_buffer(bh);
3043    }
3044    return ret;
3045}
3046EXPORT_SYMBOL(__sync_dirty_buffer);
3047
3048int sync_dirty_buffer(struct buffer_head *bh)
3049{
3050    return __sync_dirty_buffer(bh, WRITE_SYNC);
3051}
3052EXPORT_SYMBOL(sync_dirty_buffer);
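
/*
 * Sketch (illustrative): updating a metadata block and forcing it to disk.
 * The caller is assumed to hold a reference on the buffer_head.
 */
static int example_update_block(struct buffer_head *bh,
                const void *data, size_t len)
{
    lock_buffer(bh);
    memcpy(bh->b_data, data, len);
    set_buffer_uptodate(bh);
    unlock_buffer(bh);
    mark_buffer_dirty(bh);
    return sync_dirty_buffer(bh);   /* waits; -EIO on write error */
}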
3053
3054/*
3055 * try_to_free_buffers() checks if all the buffers on this particular page
3056 * are unused, and releases them if so.
3057 *
3058 * Exclusion against try_to_free_buffers may be obtained by either
3059 * locking the page or by holding its mapping's private_lock.
3060 *
3061 * If the page is dirty but all the buffers are clean then we need to
3062 * be sure to mark the page clean as well. This is because the page
3063 * may be against a block device, and a later reattachment of buffers
3064 * to a dirty page will set *all* buffers dirty. Which would corrupt
3065 * filesystem data on the same device.
3066 *
3067 * The same applies to regular filesystem pages: if all the buffers are
3068 * clean then we set the page clean and proceed. To do that, we require
3069 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3070 * private_lock.
3071 *
3072 * try_to_free_buffers() is non-blocking.
3073 */
3074static inline int buffer_busy(struct buffer_head *bh)
3075{
3076    return atomic_read(&bh->b_count) |
3077        (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3078}
3079
3080static int
3081drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3082{
3083    struct buffer_head *head = page_buffers(page);
3084    struct buffer_head *bh;
3085
3086    bh = head;
3087    do {
3088        if (buffer_write_io_error(bh) && page->mapping)
3089            set_bit(AS_EIO, &page->mapping->flags);
3090        if (buffer_busy(bh))
3091            goto failed;
3092        bh = bh->b_this_page;
3093    } while (bh != head);
3094
3095    do {
3096        struct buffer_head *next = bh->b_this_page;
3097
3098        if (bh->b_assoc_map)
3099            __remove_assoc_queue(bh);
3100        bh = next;
3101    } while (bh != head);
3102    *buffers_to_free = head;
3103    __clear_page_buffers(page);
3104    return 1;
3105failed:
3106    return 0;
3107}
3108
3109int try_to_free_buffers(struct page *page)
3110{
3111    struct address_space * const mapping = page->mapping;
3112    struct buffer_head *buffers_to_free = NULL;
3113    int ret = 0;
3114
3115    BUG_ON(!PageLocked(page));
3116    if (PageWriteback(page))
3117        return 0;
3118
3119    if (mapping == NULL) { /* can this still happen? */
3120        ret = drop_buffers(page, &buffers_to_free);
3121        goto out;
3122    }
3123
3124    spin_lock(&mapping->private_lock);
3125    ret = drop_buffers(page, &buffers_to_free);
3126
3127    /*
3128     * If the filesystem writes its buffers by hand (eg ext3)
3129     * then we can have clean buffers against a dirty page. We
3130     * clean the page here; otherwise the VM will never notice
3131     * that the filesystem did any IO at all.
3132     *
3133     * Also, during truncate, discard_buffer will have marked all
3134     * the page's buffers clean. We discover that here and clean
3135     * the page also.
3136     *
3137     * private_lock must be held over this entire operation in order
3138     * to synchronise against __set_page_dirty_buffers and prevent the
3139     * dirty bit from being lost.
3140     */
3141    if (ret)
3142        cancel_dirty_page(page, PAGE_CACHE_SIZE);
3143    spin_unlock(&mapping->private_lock);
3144out:
3145    if (buffers_to_free) {
3146        struct buffer_head *bh = buffers_to_free;
3147
3148        do {
3149            struct buffer_head *next = bh->b_this_page;
3150            free_buffer_head(bh);
3151            bh = next;
3152        } while (bh != buffers_to_free);
3153    }
3154    return ret;
3155}
3156EXPORT_SYMBOL(try_to_free_buffers);
3157
3158/*
3159 * There are no bdflush tunables left. But distributions are
3160 * still running obsolete flush daemons, so we terminate them here.
3161 *
3162 * Use of bdflush() is deprecated and will be removed in a future kernel.
3163 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3164 */
3165SYSCALL_DEFINE2(bdflush, int, func, long, data)
3166{
3167    static int msg_count;
3168
3169    if (!capable(CAP_SYS_ADMIN))
3170        return -EPERM;
3171
3172    if (msg_count < 5) {
3173        msg_count++;
3174        printk(KERN_INFO
3175            "warning: process `%s' used the obsolete bdflush"
3176            " system call\n", current->comm);
3177        printk(KERN_INFO "Fix your initscripts?\n");
3178    }
3179
3180    if (func == 1)
3181        do_exit(0);
3182    return 0;
3183}
3184
3185/*
3186 * Buffer-head allocation
3187 */
3188static struct kmem_cache *bh_cachep;
3189
3190/*
3191 * Once the number of bh's in the machine exceeds this level, we start
3192 * stripping them in writeback.
3193 */
3194static int max_buffer_heads;
3195
3196int buffer_heads_over_limit;
3197
3198struct bh_accounting {
3199    int nr; /* Number of live bh's */
3200    int ratelimit; /* Limit cacheline bouncing */
3201};
3202
3203static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3204
3205static void recalc_bh_state(void)
3206{
3207    int i;
3208    int tot = 0;
3209
3210    if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3211        return;
3212    __this_cpu_write(bh_accounting.ratelimit, 0);
3213    for_each_online_cpu(i)
3214        tot += per_cpu(bh_accounting, i).nr;
3215    buffer_heads_over_limit = (tot > max_buffer_heads);
3216}
3217
3218struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3219{
3220    struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3221    if (ret) {
3222        INIT_LIST_HEAD(&ret->b_assoc_buffers);
3223        preempt_disable();
3224        __this_cpu_inc(bh_accounting.nr);
3225        recalc_bh_state();
3226        preempt_enable();
3227    }
3228    return ret;
3229}
3230EXPORT_SYMBOL(alloc_buffer_head);
3231
3232void free_buffer_head(struct buffer_head *bh)
3233{
3234    BUG_ON(!list_empty(&bh->b_assoc_buffers));
3235    kmem_cache_free(bh_cachep, bh);
3236    preempt_disable();
3237    __this_cpu_dec(bh_accounting.nr);
3238    recalc_bh_state();
3239    preempt_enable();
3240}
3241EXPORT_SYMBOL(free_buffer_head);
3242
3243static void buffer_exit_cpu(int cpu)
3244{
3245    int i;
3246    struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3247
3248    for (i = 0; i < BH_LRU_SIZE; i++) {
3249        brelse(b->bhs[i]);
3250        b->bhs[i] = NULL;
3251    }
3252    this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3253    per_cpu(bh_accounting, cpu).nr = 0;
3254}
3255
3256static int buffer_cpu_notify(struct notifier_block *self,
3257                  unsigned long action, void *hcpu)
3258{
3259    if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3260        buffer_exit_cpu((unsigned long)hcpu);
3261    return NOTIFY_OK;
3262}
3263
3264/**
3265 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3266 * @bh: struct buffer_head
3267 *
3268 * Return true if the buffer is up-to-date and false,
3269 * with the buffer locked, if not.
3270 */
3271int bh_uptodate_or_lock(struct buffer_head *bh)
3272{
3273    if (!buffer_uptodate(bh)) {
3274        lock_buffer(bh);
3275        if (!buffer_uptodate(bh))
3276            return 0;
3277        unlock_buffer(bh);
3278    }
3279    return 1;
3280}
3281EXPORT_SYMBOL(bh_uptodate_or_lock);
3282
3283/**
3284 * bh_submit_read - Submit a locked buffer for reading
3285 * @bh: struct buffer_head
3286 *
3287 * Returns zero on success and -EIO on error.
3288 */
3289int bh_submit_read(struct buffer_head *bh)
3290{
3291    BUG_ON(!buffer_locked(bh));
3292
3293    if (buffer_uptodate(bh)) {
3294        unlock_buffer(bh);
3295        return 0;
3296    }
3297
3298    get_bh(bh);
3299    bh->b_end_io = end_buffer_read_sync;
3300    submit_bh(READ, bh);
3301    wait_on_buffer(bh);
3302    if (buffer_uptodate(bh))
3303        return 0;
3304    return -EIO;
3305}
3306EXPORT_SYMBOL(bh_submit_read);
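
/*
 * Sketch (illustrative): the two helpers above combine into the common
 * "read this metadata block unless it is already cached" idiom.
 */
static struct buffer_head *example_read_block(struct super_block *sb,
                          sector_t blocknr)
{
    struct buffer_head *bh = sb_getblk(sb, blocknr);

    if (!bh)
        return NULL;
    if (bh_uptodate_or_lock(bh))
        return bh;      /* already uptodate */
    if (bh_submit_read(bh)) {   /* buffer comes back unlocked */
        brelse(bh);
        return NULL;
    }
    return bh;
}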
3307
3308void __init buffer_init(void)
3309{
3310    int nrpages;
3311
3312    bh_cachep = kmem_cache_create("buffer_head",
3313            sizeof(struct buffer_head), 0,
3314                (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3315                SLAB_MEM_SPREAD),
3316                NULL);
3317
3318    /*
3319     * Limit the bh occupancy to 10% of ZONE_NORMAL
3320     */
3321    nrpages = (nr_free_buffer_pages() * 10) / 100;
3322    max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3323    hotcpu_notifier(buffer_cpu_notify, 0);
3324}
3325
