
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes. ie: data writeback. Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002 Andrew Morton
 * Split out of fs/inode.c
 * Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/tracepoint.h>
#include "internal.h"

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
    long nr_pages;
    struct super_block *sb;
    enum writeback_sync_modes sync_mode;
    unsigned int for_kupdate:1;
    unsigned int range_cyclic:1;
    unsigned int for_background:1;

    struct list_head list; /* pending work list */
    struct completion *done; /* set if the caller waits */
};

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure so that the definition remains local to this
 * file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
    return test_bit(BDI_writeback_running, &bdi->state);
}

static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
    struct super_block *sb = inode->i_sb;

    if (strcmp(sb->s_type->name, "bdev") == 0)
        return inode->i_mapping->backing_dev_info;

    return sb->s_bdi;
}

static inline struct inode *wb_inode(struct list_head *head)
{
    return list_entry(head, struct inode, i_wb_list);
}

/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
{
    if (bdi->wb.task) {
        wake_up_process(bdi->wb.task);
    } else {
        /*
         * The bdi thread isn't there, wake up the forker thread which
         * will create and run it.
         */
        wake_up_process(default_backing_dev_info.wb.task);
    }
}

static void bdi_queue_work(struct backing_dev_info *bdi,
               struct wb_writeback_work *work)
{
    trace_writeback_queue(bdi, work);

    spin_lock_bh(&bdi->wb_lock);
    list_add_tail(&work->list, &bdi->work_list);
    if (!bdi->wb.task)
        trace_writeback_nothread(bdi, work);
    bdi_wakeup_flusher(bdi);
    spin_unlock_bh(&bdi->wb_lock);
}

static void
__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
              bool range_cyclic)
{
    struct wb_writeback_work *work;

    /*
     * This is WB_SYNC_NONE writeback, so if allocation fails just
     * wakeup the thread for old dirty data writeback
     */
    work = kzalloc(sizeof(*work), GFP_ATOMIC);
    if (!work) {
        if (bdi->wb.task) {
            trace_writeback_nowork(bdi);
            wake_up_process(bdi->wb.task);
        }
        return;
    }

    work->sync_mode = WB_SYNC_NONE;
    work->nr_pages = nr_pages;
    work->range_cyclic = range_cyclic;

    bdi_queue_work(bdi, work);
}

/**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
 * @nr_pages: the number of pages to write
 *
 * Description:
 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 * started when this function returns, we make no guarantees on
 * completion. Caller need not hold sb s_umount semaphore.
 *
 */
void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
{
    __bdi_start_writeback(bdi, nr_pages, true);
}
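
/*
 * Example (illustrative): a caller that wants some opportunistic writeout
 * against a device simply queues the request and returns:
 *
 *    bdi_start_writeback(bdi, 1024);
 *
 * Nothing is guaranteed to have completed (or even started) by the time the
 * call returns; the flusher thread picks the work item up asynchronously.
 */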

/**
 * bdi_start_background_writeback - start background writeback
 * @bdi: the backing device to write from
 *
 * Description:
 * This makes sure WB_SYNC_NONE background writeback happens. When
 * this function returns, it is only guaranteed that for given BDI
 * some IO is happening if we are over background dirty threshold.
 * Caller need not hold sb s_umount semaphore.
 */
void bdi_start_background_writeback(struct backing_dev_info *bdi)
{
    /*
     * We just wake up the flusher thread. It will perform background
     * writeback as soon as there is no other work to do.
     */
    trace_writeback_wake_background(bdi);
    spin_lock_bh(&bdi->wb_lock);
    bdi_wakeup_flusher(bdi);
    spin_unlock_bh(&bdi->wb_lock);
}

/*
 * Remove the inode from the writeback list it is on.
 */
void inode_wb_list_del(struct inode *inode)
{
    spin_lock(&inode_wb_list_lock);
    list_del_init(&inode->i_wb_list);
    spin_unlock(&inode_wb_list_lock);
}


/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list. If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
    struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

    assert_spin_locked(&inode_wb_list_lock);
    if (!list_empty(&wb->b_dirty)) {
        struct inode *tail;

        tail = wb_inode(wb->b_dirty.next);
        if (time_before(inode->dirtied_when, tail->dirtied_when))
            inode->dirtied_when = jiffies;
    }
    list_move(&inode->i_wb_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
    struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

    assert_spin_locked(&inode_wb_list_lock);
    list_move(&inode->i_wb_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
    /*
     * Prevent speculative execution through
     * spin_unlock(&inode_wb_list_lock);
     */

    smp_mb();
    wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
    bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
    /*
     * For inodes being constantly redirtied, dirtied_when can get stuck.
     * It _appears_ to be in the future, but is actually in distant past.
     * This test is necessary to prevent such wrapped-around relative times
     * from permanently stopping the whole bdi writeback.
     */
    ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
    return ret;
}

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
                   struct list_head *dispatch_queue,
                unsigned long *older_than_this)
{
    LIST_HEAD(tmp);
    struct list_head *pos, *node;
    struct super_block *sb = NULL;
    struct inode *inode;
    int do_sb_sort = 0;

    while (!list_empty(delaying_queue)) {
        inode = wb_inode(delaying_queue->prev);
        if (older_than_this &&
            inode_dirtied_after(inode, *older_than_this))
            break;
        if (sb && sb != inode->i_sb)
            do_sb_sort = 1;
        sb = inode->i_sb;
        list_move(&inode->i_wb_list, &tmp);
    }

    /* just one sb in list, splice to dispatch_queue and we're done */
    if (!do_sb_sort) {
        list_splice(&tmp, dispatch_queue);
        return;
    }

    /* Move inodes from one superblock together */
    while (!list_empty(&tmp)) {
        sb = wb_inode(tmp.prev)->i_sb;
        list_for_each_prev_safe(pos, node, &tmp) {
            inode = wb_inode(pos);
            if (inode->i_sb == sb)
                list_move(&inode->i_wb_list, dispatch_queue);
        }
    }
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
    assert_spin_locked(&inode_wb_list_lock);
    list_splice_init(&wb->b_more_io, &wb->b_io);
    move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
    if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
        return inode->i_sb->s_op->write_inode(inode, wbc);
    return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
    DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
    wait_queue_head_t *wqh;

    wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
    while (inode->i_state & I_SYNC) {
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_wb_list_lock);
        __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
        spin_lock(&inode_wb_list_lock);
        spin_lock(&inode->i_lock);
    }
}

/*
 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
 * inode->i_lock. Either the caller has an active reference on the inode or
 * the inode has I_WILL_FREE set.
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile. We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
    struct address_space *mapping = inode->i_mapping;
    unsigned dirty;
    int ret;

    assert_spin_locked(&inode_wb_list_lock);
    assert_spin_locked(&inode->i_lock);

    if (!atomic_read(&inode->i_count))
        WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
    else
        WARN_ON(inode->i_state & I_WILL_FREE);

    if (inode->i_state & I_SYNC) {
        /*
         * If this inode is locked for writeback and we are not doing
         * writeback-for-data-integrity, move it to b_more_io so that
         * writeback can proceed with the other inodes on s_io.
         *
         * We'll have another go at writing back this inode when we
         * completed a full scan of b_io.
         */
        if (wbc->sync_mode != WB_SYNC_ALL) {
            requeue_io(inode);
            return 0;
        }

        /*
         * It's a data-integrity sync. We must wait.
         */
        inode_wait_for_writeback(inode);
    }

    BUG_ON(inode->i_state & I_SYNC);

    /* Set I_SYNC, reset I_DIRTY_PAGES */
    inode->i_state |= I_SYNC;
    inode->i_state &= ~I_DIRTY_PAGES;
    spin_unlock(&inode->i_lock);
    spin_unlock(&inode_wb_list_lock);

    ret = do_writepages(mapping, wbc);

    /*
     * Make sure to wait on the data before writing out the metadata.
     * This is important for filesystems that modify metadata on data
     * I/O completion.
     */
    if (wbc->sync_mode == WB_SYNC_ALL) {
        int err = filemap_fdatawait(mapping);
        if (ret == 0)
            ret = err;
    }

    /*
     * Some filesystems may redirty the inode during the writeback
     * due to delalloc, clear dirty metadata flags right before
     * write_inode()
     */
    spin_lock(&inode->i_lock);
    dirty = inode->i_state & I_DIRTY;
    inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
    spin_unlock(&inode->i_lock);
    /* Don't write the inode if only I_DIRTY_PAGES was set */
    if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
        int err = write_inode(inode, wbc);
        if (ret == 0)
            ret = err;
    }

    spin_lock(&inode_wb_list_lock);
    spin_lock(&inode->i_lock);
    inode->i_state &= ~I_SYNC;
    if (!(inode->i_state & I_FREEING)) {
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
            /*
             * We didn't write back all the pages. nfs_writepages()
             * sometimes bales out without doing anything.
             */
            inode->i_state |= I_DIRTY_PAGES;
            if (wbc->nr_to_write <= 0) {
                /*
                 * slice used up: queue for next turn
                 */
                requeue_io(inode);
            } else {
                /*
                 * Writeback blocked by something other than
                 * congestion. Delay the inode for some time to
                 * avoid spinning on the CPU (100% iowait)
                 * retrying writeback of the dirty page/inode
                 * that cannot be performed immediately.
                 */
                redirty_tail(inode);
            }
        } else if (inode->i_state & I_DIRTY) {
            /*
             * Filesystems can dirty the inode during writeback
             * operations, such as delayed allocation during
             * submission or metadata updates after data IO
             * completion.
             */
            redirty_tail(inode);
        } else {
            /*
             * The inode is clean. At this point we either have
             * a reference to the inode or it's on its way out.
             * No need to add it back to the LRU.
             */
            list_del_init(&inode->i_wb_list);
        }
    }
    inode_sync_complete(inode);
    return ret;
}
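
/*
 * Example (illustrative, mirroring the real callers below such as
 * write_inode_now() and sync_inode()): writeback_single_inode() expects
 * inode_wb_list_lock and inode->i_lock to be held on entry and returns with
 * both held again:
 *
 *    spin_lock(&inode_wb_list_lock);
 *    spin_lock(&inode->i_lock);
 *    ret = writeback_single_inode(inode, &wbc);
 *    spin_unlock(&inode->i_lock);
 *    spin_unlock(&inode_wb_list_lock);
 */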

/*
 * For background writeback the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 */
static bool pin_sb_for_writeback(struct super_block *sb)
{
    spin_lock(&sb_lock);
    if (list_empty(&sb->s_instances)) {
        spin_unlock(&sb_lock);
        return false;
    }

    sb->s_count++;
    spin_unlock(&sb_lock);

    if (down_read_trylock(&sb->s_umount)) {
        if (sb->s_root)
            return true;
        up_read(&sb->s_umount);
    }

    put_super(sb);
    return false;
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * If @only_this_sb is true, then find and write all such
 * inodes. Otherwise write only ones which go sequentially
 * in reverse order.
 *
 * Return 1, if the caller writeback routine should be
 * interrupted. Otherwise return 0.
 */
static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
        struct writeback_control *wbc, bool only_this_sb)
{
    while (!list_empty(&wb->b_io)) {
        long pages_skipped;
        struct inode *inode = wb_inode(wb->b_io.prev);

        if (inode->i_sb != sb) {
            if (only_this_sb) {
                /*
                 * We only want to write back data for this
                 * superblock, move all inodes not belonging
                 * to it back onto the dirty list.
                 */
                redirty_tail(inode);
                continue;
            }

            /*
             * The inode belongs to a different superblock.
             * Bounce back to the caller to unpin this and
             * pin the next superblock.
             */
            return 0;
        }

        /*
         * Don't bother with new inodes or inodes being freed, the first
         * kind does not need periodic writeout yet, and for the latter
         * kind writeout is handled by the freer.
         */
        spin_lock(&inode->i_lock);
        if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
            spin_unlock(&inode->i_lock);
            requeue_io(inode);
            continue;
        }

        /*
         * Was this inode dirtied after sync_sb_inodes was called?
         * This keeps sync from extra jobs and livelock.
         */
        if (inode_dirtied_after(inode, wbc->wb_start)) {
            spin_unlock(&inode->i_lock);
            return 1;
        }

        __iget(inode);

        pages_skipped = wbc->pages_skipped;
        writeback_single_inode(inode, wbc);
        if (wbc->pages_skipped != pages_skipped) {
            /*
             * writeback is not making progress due to locked
             * buffers. Skip this inode for now.
             */
            redirty_tail(inode);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_wb_list_lock);
        iput(inode);
        cond_resched();
        spin_lock(&inode_wb_list_lock);
        if (wbc->nr_to_write <= 0) {
            wbc->more_io = 1;
            return 1;
        }
        if (!list_empty(&wb->b_more_io))
            wbc->more_io = 1;
    }
    /* b_io is empty */
    return 1;
}

void writeback_inodes_wb(struct bdi_writeback *wb,
        struct writeback_control *wbc)
{
    int ret = 0;

    if (!wbc->wb_start)
        wbc->wb_start = jiffies; /* livelock avoidance */
    spin_lock(&inode_wb_list_lock);
    if (!wbc->for_kupdate || list_empty(&wb->b_io))
        queue_io(wb, wbc->older_than_this);

    while (!list_empty(&wb->b_io)) {
        struct inode *inode = wb_inode(wb->b_io.prev);
        struct super_block *sb = inode->i_sb;

        if (!pin_sb_for_writeback(sb)) {
            requeue_io(inode);
            continue;
        }
        ret = writeback_sb_inodes(sb, wb, wbc, false);
        drop_super(sb);

        if (ret)
            break;
    }
    spin_unlock(&inode_wb_list_lock);
    /* Leave any unwritten inodes on b_io */
}

static void __writeback_inodes_sb(struct super_block *sb,
        struct bdi_writeback *wb, struct writeback_control *wbc)
{
    WARN_ON(!rwsem_is_locked(&sb->s_umount));

    spin_lock(&inode_wb_list_lock);
    if (!wbc->for_kupdate || list_empty(&wb->b_io))
        queue_io(wb, wbc->older_than_this);
    writeback_sb_inodes(sb, wb, wbc, true);
    spin_unlock(&inode_wb_list_lock);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation. We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode. Also, the code reevaluates
 * the dirty thresholds each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES 1024

static inline bool over_bground_thresh(void)
{
    unsigned long background_thresh, dirty_thresh;

    global_dirty_limits(&background_thresh, &dirty_thresh);

    return (global_page_state(NR_FILE_DIRTY) +
        global_page_state(NR_UNSTABLE_NFS) > background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space. So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval. But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write. So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
             struct wb_writeback_work *work)
{
    struct writeback_control wbc = {
        .sync_mode = work->sync_mode,
        .older_than_this = NULL,
        .for_kupdate = work->for_kupdate,
        .for_background = work->for_background,
        .range_cyclic = work->range_cyclic,
    };
    unsigned long oldest_jif;
    long wrote = 0;
    long write_chunk;
    struct inode *inode;

    if (wbc.for_kupdate) {
        wbc.older_than_this = &oldest_jif;
        oldest_jif = jiffies -
                msecs_to_jiffies(dirty_expire_interval * 10);
    }
    if (!wbc.range_cyclic) {
        wbc.range_start = 0;
        wbc.range_end = LLONG_MAX;
    }

    /*
     * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
     * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
     * here avoids calling into writeback_inodes_wb() more than once.
     *
     * The intended call sequence for WB_SYNC_ALL writeback is:
     *
     * wb_writeback()
     *     __writeback_inodes_sb()     <== called only once
     *         write_cache_pages()     <== called once for each inode
     *             (quickly) tag currently dirty pages
     *             (maybe slowly) sync all tagged pages
     */
    if (wbc.sync_mode == WB_SYNC_NONE)
        write_chunk = MAX_WRITEBACK_PAGES;
    else
        write_chunk = LONG_MAX;

    wbc.wb_start = jiffies; /* livelock avoidance */
    for (;;) {
        /*
         * Stop writeback when nr_pages has been consumed
         */
        if (work->nr_pages <= 0)
            break;

        /*
         * Background writeout and kupdate-style writeback may
         * run forever. Stop them if there is other work to do
         * so that e.g. sync can proceed. They'll be restarted
         * after the other works are all done.
         */
        if ((work->for_background || work->for_kupdate) &&
            !list_empty(&wb->bdi->work_list))
            break;

        /*
         * For background writeout, stop when we are below the
         * background dirty threshold
         */
        if (work->for_background && !over_bground_thresh())
            break;

        wbc.more_io = 0;
        wbc.nr_to_write = write_chunk;
        wbc.pages_skipped = 0;

        trace_wbc_writeback_start(&wbc, wb->bdi);
        if (work->sb)
            __writeback_inodes_sb(work->sb, wb, &wbc);
        else
            writeback_inodes_wb(wb, &wbc);
        trace_wbc_writeback_written(&wbc, wb->bdi);

        work->nr_pages -= write_chunk - wbc.nr_to_write;
        wrote += write_chunk - wbc.nr_to_write;

        /*
         * If we consumed everything, see if we have more
         */
        if (wbc.nr_to_write <= 0)
            continue;
        /*
         * Didn't write everything and we don't have more IO, bail
         */
        if (!wbc.more_io)
            break;
        /*
         * Did we write something? Try for more
         */
        if (wbc.nr_to_write < write_chunk)
            continue;
        /*
         * Nothing written. Wait for some inode to
         * become available for writeback. Otherwise
         * we'll just busyloop.
         */
        spin_lock(&inode_wb_list_lock);
        if (!list_empty(&wb->b_more_io)) {
            inode = wb_inode(wb->b_more_io.prev);
            trace_wbc_writeback_wait(&wbc, wb->bdi);
            spin_lock(&inode->i_lock);
            inode_wait_for_writeback(inode);
            spin_unlock(&inode->i_lock);
        }
        spin_unlock(&inode_wb_list_lock);
    }

    return wrote;
}

/*
 * Return the next wb_writeback_work struct that hasn't been processed yet.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
    struct wb_writeback_work *work = NULL;

    spin_lock_bh(&bdi->wb_lock);
    if (!list_empty(&bdi->work_list)) {
        work = list_entry(bdi->work_list.next,
                  struct wb_writeback_work, list);
        list_del_init(&work->list);
    }
    spin_unlock_bh(&bdi->wb_lock);
    return work;
}

/*
 * Add in the number of potentially dirty inodes, because each inode
 * write can dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
    return global_page_state(NR_FILE_DIRTY) +
        global_page_state(NR_UNSTABLE_NFS) +
        get_nr_dirty_inodes();
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
    if (over_bground_thresh()) {

        struct wb_writeback_work work = {
            .nr_pages = LONG_MAX,
            .sync_mode = WB_SYNC_NONE,
            .for_background = 1,
            .range_cyclic = 1,
        };

        return wb_writeback(wb, &work);
    }

    return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
    unsigned long expired;
    long nr_pages;

    /*
     * When set to zero, disable periodic writeback
     */
    if (!dirty_writeback_interval)
        return 0;

    expired = wb->last_old_flush +
            msecs_to_jiffies(dirty_writeback_interval * 10);
    if (time_before(jiffies, expired))
        return 0;

    wb->last_old_flush = jiffies;
    nr_pages = get_nr_dirty_pages();

    if (nr_pages) {
        struct wb_writeback_work work = {
            .nr_pages = nr_pages,
            .sync_mode = WB_SYNC_NONE,
            .for_kupdate = 1,
            .range_cyclic = 1,
        };

        return wb_writeback(wb, &work);
    }

    return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
    struct backing_dev_info *bdi = wb->bdi;
    struct wb_writeback_work *work;
    long wrote = 0;

    set_bit(BDI_writeback_running, &wb->bdi->state);
    while ((work = get_next_work_item(bdi)) != NULL) {
        /*
         * Override sync mode, in case we must wait for completion
         * because this thread is exiting now.
         */
        if (force_wait)
            work->sync_mode = WB_SYNC_ALL;

        trace_writeback_exec(bdi, work);

        wrote += wb_writeback(wb, work);

        /*
         * Notify the caller of completion if this is a synchronous
         * work item, otherwise just free it.
         */
        if (work->done)
            complete(work->done);
        else
            kfree(work);
    }

    /*
     * Check for periodic writeback, kupdated() style
     */
    wrote += wb_check_old_data_flush(wb);
    wrote += wb_check_background_flush(wb);
    clear_bit(BDI_writeback_running, &wb->bdi->state);

    return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_thread(void *data)
{
    struct bdi_writeback *wb = data;
    struct backing_dev_info *bdi = wb->bdi;
    long pages_written;

    current->flags |= PF_SWAPWRITE;
    set_freezable();
    wb->last_active = jiffies;

    /*
     * Our parent may run at a different priority, just set us to normal
     */
    set_user_nice(current, 0);

    trace_writeback_thread_start(bdi);

    while (!kthread_should_stop()) {
        /*
         * Remove own delayed wake-up timer, since we are already awake
         * and we'll take care of the periodic write-back.
         */
        del_timer(&wb->wakeup_timer);

        pages_written = wb_do_writeback(wb, 0);

        trace_writeback_pages_written(pages_written);

        if (pages_written)
            wb->last_active = jiffies;

        set_current_state(TASK_INTERRUPTIBLE);
        if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
            __set_current_state(TASK_RUNNING);
            continue;
        }

        if (wb_has_dirty_io(wb) && dirty_writeback_interval)
            schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
        else {
            /*
             * We have nothing to do, so can go sleep without any
             * timeout and save power. When a work is queued or
             * something is made dirty - we will be woken up.
             */
            schedule();
        }

        try_to_freeze();
    }

    /* Flush any work that raced with us exiting */
    if (!list_empty(&bdi->work_list))
        wb_do_writeback(wb, 1);

    trace_writeback_thread_stop(bdi);
    return 0;
}


/*
 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
    struct backing_dev_info *bdi;

    if (!nr_pages) {
        nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
    }

    rcu_read_lock();
    list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
        if (!bdi_has_dirty_io(bdi))
            continue;
        __bdi_start_writeback(bdi, nr_pages, false);
    }
    rcu_read_unlock();
}
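
/*
 * Example (illustrative; the caller lives outside this file): the sys_sync()
 * path typically kicks all devices first with
 *
 *    wakeup_flusher_threads(0);
 *
 * where 0 means "all dirty pages", and then waits per superblock via
 * sync_inodes_sb().
 */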

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
    if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
        struct dentry *dentry;
        const char *name = "?";

        dentry = d_find_alias(inode);
        if (dentry) {
            spin_lock(&dentry->d_lock);
            name = (const char *) dentry->d_name.name;
        }
        printk(KERN_DEBUG
               "%s(%d): dirtied inode %lu (%s) on %s\n",
               current->comm, task_pid_nr(current), inode->i_ino,
               name, inode->i_sb->s_id);
        if (dentry) {
            spin_unlock(&dentry->d_lock);
            dput(dentry);
        }
    }
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
    struct super_block *sb = inode->i_sb;
    struct backing_dev_info *bdi = NULL;

    /*
     * Don't do this for I_DIRTY_PAGES - that doesn't actually
     * dirty the inode itself
     */
    if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
        if (sb->s_op->dirty_inode)
            sb->s_op->dirty_inode(inode, flags);
    }

    /*
     * make sure that changes are seen by all cpus before we test i_state
     * -- mikulas
     */
    smp_mb();

    /* avoid the locking if we can */
    if ((inode->i_state & flags) == flags)
        return;

    if (unlikely(block_dump))
        block_dump___mark_inode_dirty(inode);

    spin_lock(&inode->i_lock);
    if ((inode->i_state & flags) != flags) {
        const int was_dirty = inode->i_state & I_DIRTY;

        inode->i_state |= flags;

        /*
         * If the inode is being synced, just update its dirty state.
         * The unlocker will place the inode on the appropriate
         * superblock list, based upon its state.
         */
        if (inode->i_state & I_SYNC)
            goto out_unlock_inode;

        /*
         * Only add valid (hashed) inodes to the superblock's
         * dirty list. Add blockdev inodes as well.
         */
        if (!S_ISBLK(inode->i_mode)) {
            if (inode_unhashed(inode))
                goto out_unlock_inode;
        }
        if (inode->i_state & I_FREEING)
            goto out_unlock_inode;

        /*
         * If the inode was already on b_dirty/b_io/b_more_io, don't
         * reposition it (that would break b_dirty time-ordering).
         */
        if (!was_dirty) {
            bool wakeup_bdi = false;
            bdi = inode_to_bdi(inode);

            if (bdi_cap_writeback_dirty(bdi)) {
                WARN(!test_bit(BDI_registered, &bdi->state),
                     "bdi-%s not registered\n", bdi->name);

                /*
                 * If this is the first dirty inode for this
                 * bdi, we have to wake-up the corresponding
                 * bdi thread to make sure background
                 * write-back happens later.
                 */
                if (!wb_has_dirty_io(&bdi->wb))
                    wakeup_bdi = true;
            }

            spin_unlock(&inode->i_lock);
            spin_lock(&inode_wb_list_lock);
            inode->dirtied_when = jiffies;
            list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
            spin_unlock(&inode_wb_list_lock);

            if (wakeup_bdi)
                bdi_wakeup_thread_delayed(bdi);
            return;
        }
    }
out_unlock_inode:
    spin_unlock(&inode->i_lock);

}
EXPORT_SYMBOL(__mark_inode_dirty);
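
/*
 * Example (illustrative): filesystems normally reach __mark_inode_dirty()
 * through the wrappers in include/linux/fs.h, e.g. after a timestamp update:
 *
 *    inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 *    mark_inode_dirty_sync(inode);
 *
 * which amounts to __mark_inode_dirty(inode, I_DIRTY_SYNC).
 */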

/*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io. They are moved back onto
 * bdi->b_dirty as they are selected for writing. This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct super_block *sb)
{
    struct inode *inode, *old_inode = NULL;

    /*
     * We need to be protected against the filesystem going from
     * r/o to r/w or vice versa.
     */
    WARN_ON(!rwsem_is_locked(&sb->s_umount));

    spin_lock(&inode_sb_list_lock);

    /*
     * Data integrity sync. Must wait for all pages under writeback,
     * because there may have been pages dirtied before our sync
     * call, but which had writeout started before we write it out.
     * In which case, the inode may not be on the dirty list, but
     * we still have to wait for that writeout.
     */
    list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
        struct address_space *mapping = inode->i_mapping;

        spin_lock(&inode->i_lock);
        if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
            (mapping->nrpages == 0)) {
            spin_unlock(&inode->i_lock);
            continue;
        }
        __iget(inode);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_sb_list_lock);

        /*
         * We hold a reference to 'inode' so it couldn't have been
         * removed from s_inodes list while we dropped the
         * inode_sb_list_lock. We cannot iput the inode now as we can
         * be holding the last reference and we cannot iput it under
         * inode_sb_list_lock. So we keep the reference and iput it
         * later.
         */
        iput(old_inode);
        old_inode = inode;

        filemap_fdatawait(mapping);

        cond_resched();

        spin_lock(&inode_sb_list_lock);
    }
    spin_unlock(&inode_sb_list_lock);
    iput(old_inode);
}

/**
 * writeback_inodes_sb_nr - writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
{
    DECLARE_COMPLETION_ONSTACK(done);
    struct wb_writeback_work work = {
        .sb = sb,
        .sync_mode = WB_SYNC_NONE,
        .done = &done,
        .nr_pages = nr,
    };

    WARN_ON(!rwsem_is_locked(&sb->s_umount));
    bdi_queue_work(sb->s_bdi, &work);
    wait_for_completion(&done);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);
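
/*
 * Example (illustrative): as the WARN_ON above implies, callers are expected
 * to hold s_umount around the call:
 *
 *    down_read(&sb->s_umount);
 *    writeback_inodes_sb_nr(sb, nr_pages);
 *    up_read(&sb->s_umount);
 */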

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb(struct super_block *sb)
{
    return writeback_inodes_sb_nr(sb, get_nr_dirty_pages());
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * writeback_inodes_sb_if_idle - start writeback if none underway
 * @sb: the superblock
 *
 * Invoke writeback_inodes_sb if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_if_idle(struct super_block *sb)
{
    if (!writeback_in_progress(sb->s_bdi)) {
        down_read(&sb->s_umount);
        writeback_inodes_sb(sb);
        up_read(&sb->s_umount);
        return 1;
    } else
        return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_if_idle);

/**
 * writeback_inodes_sb_nr_if_idle - start writeback if none underway
 * @sb: the superblock
 * @nr: the number of pages to write
 *
 * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
 * Returns 1 if writeback was started, 0 if not.
 */
int writeback_inodes_sb_nr_if_idle(struct super_block *sb,
                   unsigned long nr)
{
    if (!writeback_in_progress(sb->s_bdi)) {
        down_read(&sb->s_umount);
        writeback_inodes_sb_nr(sb, nr);
        up_read(&sb->s_umount);
        return 1;
    } else
        return 0;
}
EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
    DECLARE_COMPLETION_ONSTACK(done);
    struct wb_writeback_work work = {
        .sb = sb,
        .sync_mode = WB_SYNC_ALL,
        .nr_pages = LONG_MAX,
        .range_cyclic = 0,
        .done = &done,
    };

    WARN_ON(!rwsem_is_locked(&sb->s_umount));

    bdi_queue_work(sb->s_bdi, &work);
    wait_for_completion(&done);

    wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);
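
/*
 * Example (illustrative; the caller is assumed to live outside this file):
 * sync_filesystem()-style callers take s_umount and then run the
 * data-integrity pass:
 *
 *    down_read(&sb->s_umount);
 *    sync_inodes_sb(sb);
 *    up_read(&sb->s_umount);
 */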

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
    int ret;
    struct writeback_control wbc = {
        .nr_to_write = LONG_MAX,
        .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
        .range_start = 0,
        .range_end = LLONG_MAX,
    };

    if (!mapping_cap_writeback_dirty(inode->i_mapping))
        wbc.nr_to_write = 0;

    might_sleep();
    spin_lock(&inode_wb_list_lock);
    spin_lock(&inode->i_lock);
    ret = writeback_single_inode(inode, &wbc);
    spin_unlock(&inode->i_lock);
    spin_unlock(&inode_wb_list_lock);
    if (sync)
        inode_sync_wait(inode);
    return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
    int ret;

    spin_lock(&inode_wb_list_lock);
    spin_lock(&inode->i_lock);
    ret = writeback_single_inode(inode, wbc);
    spin_unlock(&inode->i_lock);
    spin_unlock(&inode_wb_list_lock);
    return ret;
}
EXPORT_SYMBOL(sync_inode);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: wait for I/O to complete.
 *
 * Write an inode to disk and adjust its dirty state after completion.
 *
 * Note: only writes the actual inode, no associated data or other metadata.
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
    struct writeback_control wbc = {
        .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
        .nr_to_write = 0, /* metadata-only */
    };

    return sync_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);
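
/*
 * Example (illustrative): a filesystem ->fsync() that only needs the inode
 * itself on disk, with the data pages already written by its caller, could do
 *
 *    return sync_inode_metadata(inode, 1);
 *
 * i.e. a WB_SYNC_ALL writeback_single_inode() with nr_to_write == 0.
 */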