drivers/md/raid5.c

1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21/*
22 * BITMAP UNPLUGGING:
23 *
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation.
27 *
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
32 * new additions.
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment seq_flush, thus closing the current
39 * batch.
40 * When we notice that seq_flush > seq_write, we write out all pending updates
41 * to the bitmap, and advance seq_write to where seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits.
44 */
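/*
 * Illustration only, not part of the driver: a minimal sketch of the batch
 * check described above, using the conf->seq_write and sh->bm_seq fields
 * mentioned in the comment.  A stripe whose batch number is still ahead of
 * the last successfully written batch must keep waiting on bitmap_list.
 */
#if 0
static inline int stripe_bitmap_batch_written(struct r5conf *conf,
                                              struct stripe_head *sh)
{
    /* bm_seq was recorded as seq_flush+1 when the bio was added */
    return sh->bm_seq - conf->seq_write <= 0;
}
#endif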
45
46#include <linux/blkdev.h>
47#include <linux/kthread.h>
48#include <linux/raid/pq.h>
49#include <linux/async_tx.h>
50#include <linux/module.h>
51#include <linux/async.h>
52#include <linux/seq_file.h>
53#include <linux/cpu.h>
54#include <linux/slab.h>
55#include <linux/ratelimit.h>
56#include "md.h"
57#include "raid5.h"
58#include "raid0.h"
59#include "bitmap.h"
60
61/*
62 * Stripe cache
63 */
64
65#define NR_STRIPES 256
66#define STRIPE_SIZE PAGE_SIZE
67#define STRIPE_SHIFT (PAGE_SHIFT - 9)
68#define STRIPE_SECTORS (STRIPE_SIZE>>9)
69#define IO_THRESHOLD 1
70#define BYPASS_THRESHOLD 1
71#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
72#define HASH_MASK (NR_HASH - 1)
73
74static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
75{
76    int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
77    return &conf->stripe_hashtbl[hash];
78}
79
80/* Bios attached to a stripe+device for I/O are linked together in bi_sector
81 * order without overlap. There may be several bios per stripe+device, and
82 * a bio could span several devices.
83 * When walking this list for a particular stripe+device, we must never proceed
84 * beyond a bio that extends past this device, as the next bio might no longer
85 * be valid.
86 * This function is used to determine the 'next' bio in the list, given the sector
87 * of the current stripe+device.
88 */
89static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
90{
91    int sectors = bio->bi_size >> 9;
92    if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
93        return bio->bi_next;
94    else
95        return NULL;
96}
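/*
 * Illustration only, not part of the driver: the canonical walk over the
 * bios queued against one stripe+device, as done by ops_run_biofill() and
 * ops_run_biodrain() further down.  r5_next_bio() returns NULL as soon as
 * a bio extends past this device, which ends the walk.
 */
#if 0
static void walk_stripe_dev_bios(struct r5dev *dev)
{
    struct bio *bi = dev->toread;

    while (bi && bi->bi_sector < dev->sector + STRIPE_SECTORS) {
        /* process 'bi' for this stripe+device here */
        bi = r5_next_bio(bi, dev->sector);
    }
}
#endif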
97
98/*
99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */
102static inline int raid5_bi_processed_stripes(struct bio *bio)
103{
104    atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
105    return (atomic_read(segments) >> 16) & 0xffff;
106}
107
108static inline int raid5_dec_bi_active_stripes(struct bio *bio)
109{
110    atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
111    return atomic_sub_return(1, segments) & 0xffff;
112}
113
114static inline void raid5_inc_bi_active_stripes(struct bio *bio)
115{
116    atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
117    atomic_inc(segments);
118}
119
120static inline void raid5_set_bi_processed_stripes(struct bio *bio,
121    unsigned int cnt)
122{
123    atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
124    int old, new;
125
126    do {
127        old = atomic_read(segments);
128        new = (old & 0xffff) | (cnt << 16);
129    } while (atomic_cmpxchg(segments, old, new) != old);
130}
131
132static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
133{
134    atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
135    atomic_set(segments, cnt);
136}
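/*
 * Illustration only, not part of the driver: how the two 16-bit counters
 * above share bi_phys_segments.  Bits 0-15 hold the (biased) active-stripe
 * count, bits 16-31 the processed-stripe count; the function name below is
 * hypothetical, only the accessors above are real.
 */
#if 0
static void bi_counters_example(struct bio *bio)
{
    raid5_set_bi_stripes(bio, 1);           /* active count = 1  */
    raid5_inc_bi_active_stripes(bio);       /* active count = 2  */
    raid5_set_bi_processed_stripes(bio, 3); /* upper 16 bits = 3 */
    BUG_ON(raid5_dec_bi_active_stripes(bio) != 1);
    BUG_ON(raid5_bi_processed_stripes(bio) != 3);
}
#endif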
137
138/* Find first data disk in a raid6 stripe */
139static inline int raid6_d0(struct stripe_head *sh)
140{
141    if (sh->ddf_layout)
142        /* ddf always starts from the first device */
143        return 0;
144    /* md starts just after Q block */
145    if (sh->qd_idx == sh->disks - 1)
146        return 0;
147    else
148        return sh->qd_idx + 1;
149}
150static inline int raid6_next_disk(int disk, int raid_disks)
151{
152    disk++;
153    return (disk < raid_disks) ? disk : 0;
154}
155
156/* When walking through the disks in a raid6, starting at raid6_d0,
157 * we need to map each disk to a 'slot', where the data disks are slot
158 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
159 * is raid_disks-1. This helper does that mapping.
160 */
161static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
162                 int *count, int syndrome_disks)
163{
164    int slot = *count;
165
166    if (sh->ddf_layout)
167        (*count)++;
168    if (idx == sh->pd_idx)
169        return syndrome_disks;
170    if (idx == sh->qd_idx)
171        return syndrome_disks + 1;
172    if (!sh->ddf_layout)
173        (*count)++;
174    return slot;
175}
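/*
 * Illustration only, not part of the driver: with the md (non-ddf) layout,
 * walking the devices from raid6_d0() and feeding each index through
 * raid6_idx_to_slot() yields the data blocks in slots 0..disks-3, then P
 * in slot disks-2 and Q in slot disks-1, independent of where pd_idx and
 * qd_idx happen to sit in the physical device order.
 */
#if 0
static void raid6_slot_example(struct stripe_head *sh)
{
    int syndrome_disks = sh->disks - 2;     /* non-ddf layout */
    int d0_idx = raid6_d0(sh);
    int count = 0;
    int i = d0_idx;

    do {
        int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

        pr_debug("disk %d -> slot %d\n", i, slot);
        i = raid6_next_disk(i, sh->disks);
    } while (i != d0_idx);
}
#endif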
176
177static void return_io(struct bio *return_bi)
178{
179    struct bio *bi = return_bi;
180    while (bi) {
181
182        return_bi = bi->bi_next;
183        bi->bi_next = NULL;
184        bi->bi_size = 0;
185        bio_endio(bi, 0);
186        bi = return_bi;
187    }
188}
189
190static void print_raid5_conf (struct r5conf *conf);
191
192static int stripe_operations_active(struct stripe_head *sh)
193{
194    return sh->check_state || sh->reconstruct_state ||
195           test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
196           test_bit(STRIPE_COMPUTE_RUN, &sh->state);
197}
198
199static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
200{
201    BUG_ON(!list_empty(&sh->lru));
202    BUG_ON(atomic_read(&conf->active_stripes)==0);
203    if (test_bit(STRIPE_HANDLE, &sh->state)) {
204        if (test_bit(STRIPE_DELAYED, &sh->state) &&
205            !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
206            list_add_tail(&sh->lru, &conf->delayed_list);
207        else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
208               sh->bm_seq - conf->seq_write > 0)
209            list_add_tail(&sh->lru, &conf->bitmap_list);
210        else {
211            clear_bit(STRIPE_DELAYED, &sh->state);
212            clear_bit(STRIPE_BIT_DELAY, &sh->state);
213            list_add_tail(&sh->lru, &conf->handle_list);
214        }
215        md_wakeup_thread(conf->mddev->thread);
216    } else {
217        BUG_ON(stripe_operations_active(sh));
218        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
219            if (atomic_dec_return(&conf->preread_active_stripes)
220                < IO_THRESHOLD)
221                md_wakeup_thread(conf->mddev->thread);
222        atomic_dec(&conf->active_stripes);
223        if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
224            list_add_tail(&sh->lru, &conf->inactive_list);
225            wake_up(&conf->wait_for_stripe);
226            if (conf->retry_read_aligned)
227                md_wakeup_thread(conf->mddev->thread);
228        }
229    }
230}
231
232static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
233{
234    if (atomic_dec_and_test(&sh->count))
235        do_release_stripe(conf, sh);
236}
237
238static void release_stripe(struct stripe_head *sh)
239{
240    struct r5conf *conf = sh->raid_conf;
241    unsigned long flags;
242
243    local_irq_save(flags);
244    if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
245        do_release_stripe(conf, sh);
246        spin_unlock(&conf->device_lock);
247    }
248    local_irq_restore(flags);
249}
250
251static inline void remove_hash(struct stripe_head *sh)
252{
253    pr_debug("remove_hash(), stripe %llu\n",
254        (unsigned long long)sh->sector);
255
256    hlist_del_init(&sh->hash);
257}
258
259static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
260{
261    struct hlist_head *hp = stripe_hash(conf, sh->sector);
262
263    pr_debug("insert_hash(), stripe %llu\n",
264        (unsigned long long)sh->sector);
265
266    hlist_add_head(&sh->hash, hp);
267}
268
269
270/* find an idle stripe, make sure it is unhashed, and return it. */
271static struct stripe_head *get_free_stripe(struct r5conf *conf)
272{
273    struct stripe_head *sh = NULL;
274    struct list_head *first;
275
276    if (list_empty(&conf->inactive_list))
277        goto out;
278    first = conf->inactive_list.next;
279    sh = list_entry(first, struct stripe_head, lru);
280    list_del_init(first);
281    remove_hash(sh);
282    atomic_inc(&conf->active_stripes);
283out:
284    return sh;
285}
286
287static void shrink_buffers(struct stripe_head *sh)
288{
289    struct page *p;
290    int i;
291    int num = sh->raid_conf->pool_size;
292
293    for (i = 0; i < num ; i++) {
294        p = sh->dev[i].page;
295        if (!p)
296            continue;
297        sh->dev[i].page = NULL;
298        put_page(p);
299    }
300}
301
302static int grow_buffers(struct stripe_head *sh)
303{
304    int i;
305    int num = sh->raid_conf->pool_size;
306
307    for (i = 0; i < num; i++) {
308        struct page *page;
309
310        if (!(page = alloc_page(GFP_KERNEL))) {
311            return 1;
312        }
313        sh->dev[i].page = page;
314    }
315    return 0;
316}
317
318static void raid5_build_block(struct stripe_head *sh, int i, int previous);
319static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
320                struct stripe_head *sh);
321
322static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
323{
324    struct r5conf *conf = sh->raid_conf;
325    int i;
326
327    BUG_ON(atomic_read(&sh->count) != 0);
328    BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
329    BUG_ON(stripe_operations_active(sh));
330
331    pr_debug("init_stripe called, stripe %llu\n",
332        (unsigned long long)sh->sector);
333
334    remove_hash(sh);
335
336    sh->generation = conf->generation - previous;
337    sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
338    sh->sector = sector;
339    stripe_set_idx(sector, conf, previous, sh);
340    sh->state = 0;
341
342
343    for (i = sh->disks; i--; ) {
344        struct r5dev *dev = &sh->dev[i];
345
346        if (dev->toread || dev->read || dev->towrite || dev->written ||
347            test_bit(R5_LOCKED, &dev->flags)) {
348            printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
349                   (unsigned long long)sh->sector, i, dev->toread,
350                   dev->read, dev->towrite, dev->written,
351                   test_bit(R5_LOCKED, &dev->flags));
352            WARN_ON(1);
353        }
354        dev->flags = 0;
355        raid5_build_block(sh, i, previous);
356    }
357    insert_hash(conf, sh);
358}
359
360static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
361                     short generation)
362{
363    struct stripe_head *sh;
364    struct hlist_node *hn;
365
366    pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
367    hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
368        if (sh->sector == sector && sh->generation == generation)
369            return sh;
370    pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
371    return NULL;
372}
373
374/*
375 * Need to check if array has failed when deciding whether to:
376 * - start an array
377 * - remove non-faulty devices
378 * - add a spare
379 * - allow a reshape
380 * This determination is simple when no reshape is happening.
381 * However if there is a reshape, we need to carefully check
382 * both the before and after sections.
383 * This is because some failed devices may only affect one
384 * of the two sections, and some non-in_sync devices may
385 * be in_sync in the section most affected by failed devices.
386 */
387static int calc_degraded(struct r5conf *conf)
388{
389    int degraded, degraded2;
390    int i;
391
392    rcu_read_lock();
393    degraded = 0;
394    for (i = 0; i < conf->previous_raid_disks; i++) {
395        struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
396        if (rdev && test_bit(Faulty, &rdev->flags))
397            rdev = rcu_dereference(conf->disks[i].replacement);
398        if (!rdev || test_bit(Faulty, &rdev->flags))
399            degraded++;
400        else if (test_bit(In_sync, &rdev->flags))
401            ;
402        else
403            /* not in-sync or faulty.
404             * If the reshape increases the number of devices,
405             * this is being recovered by the reshape, so
406             * this 'previous' section is not in_sync.
407             * If the number of devices is being reduced however,
408             * the device can only be part of the array if
409             * we are reverting a reshape, so this section will
410             * be in-sync.
411             */
412            if (conf->raid_disks >= conf->previous_raid_disks)
413                degraded++;
414    }
415    rcu_read_unlock();
416    if (conf->raid_disks == conf->previous_raid_disks)
417        return degraded;
418    rcu_read_lock();
419    degraded2 = 0;
420    for (i = 0; i < conf->raid_disks; i++) {
421        struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
422        if (rdev && test_bit(Faulty, &rdev->flags))
423            rdev = rcu_dereference(conf->disks[i].replacement);
424        if (!rdev || test_bit(Faulty, &rdev->flags))
425            degraded2++;
426        else if (test_bit(In_sync, &rdev->flags))
427            ;
428        else
429            /* not in-sync or faulty.
430             * If reshape increases the number of devices, this
431             * section has already been recovered, else it
432             * almost certainly hasn't.
433             */
434            if (conf->raid_disks <= conf->previous_raid_disks)
435                degraded2++;
436    }
437    rcu_read_unlock();
438    if (degraded2 > degraded)
439        return degraded2;
440    return degraded;
441}
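/*
 * Illustration only, not part of the driver: the per-section rule that
 * calc_degraded() applies in each of the two loops above, written out as
 * a predicate.  'section_recovered' stands for the reshape-direction test
 * (conf->raid_disks vs conf->previous_raid_disks) used in each loop; the
 * helper itself is hypothetical.
 */
#if 0
static int counts_as_degraded(struct md_rdev *rdev, int section_recovered)
{
    if (!rdev || test_bit(Faulty, &rdev->flags))
        return 1;                   /* missing or faulty device */
    if (test_bit(In_sync, &rdev->flags))
        return 0;                   /* fully usable in this section */
    return !section_recovered;      /* usable only if already recovered */
}
#endif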
442
443static int has_failed(struct r5conf *conf)
444{
445    int degraded;
446
447    if (conf->mddev->reshape_position == MaxSector)
448        return conf->mddev->degraded > conf->max_degraded;
449
450    degraded = calc_degraded(conf);
451    if (degraded > conf->max_degraded)
452        return 1;
453    return 0;
454}
455
456static struct stripe_head *
457get_active_stripe(struct r5conf *conf, sector_t sector,
458          int previous, int noblock, int noquiesce)
459{
460    struct stripe_head *sh;
461
462    pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
463
464    spin_lock_irq(&conf->device_lock);
465
466    do {
467        wait_event_lock_irq(conf->wait_for_stripe,
468                    conf->quiesce == 0 || noquiesce,
469                    conf->device_lock, /* nothing */);
470        sh = __find_stripe(conf, sector, conf->generation - previous);
471        if (!sh) {
472            if (!conf->inactive_blocked)
473                sh = get_free_stripe(conf);
474            if (noblock && sh == NULL)
475                break;
476            if (!sh) {
477                conf->inactive_blocked = 1;
478                wait_event_lock_irq(conf->wait_for_stripe,
479                            !list_empty(&conf->inactive_list) &&
480                            (atomic_read(&conf->active_stripes)
481                             < (conf->max_nr_stripes *3/4)
482                             || !conf->inactive_blocked),
483                            conf->device_lock,
484                            );
485                conf->inactive_blocked = 0;
486            } else
487                init_stripe(sh, sector, previous);
488        } else {
489            if (atomic_read(&sh->count)) {
490                BUG_ON(!list_empty(&sh->lru)
491                    && !test_bit(STRIPE_EXPANDING, &sh->state)
492                    && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state));
493            } else {
494                if (!test_bit(STRIPE_HANDLE, &sh->state))
495                    atomic_inc(&conf->active_stripes);
496                if (list_empty(&sh->lru) &&
497                    !test_bit(STRIPE_EXPANDING, &sh->state))
498                    BUG();
499                list_del_init(&sh->lru);
500            }
501        }
502    } while (sh == NULL);
503
504    if (sh)
505        atomic_inc(&sh->count);
506
507    spin_unlock_irq(&conf->device_lock);
508    return sh;
509}
510
511/* Determine if 'data_offset' or 'new_data_offset' should be used
512 * in this stripe_head.
513 */
514static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
515{
516    sector_t progress = conf->reshape_progress;
517    /* Need a memory barrier to make sure we see the value
518     * of conf->generation, or ->data_offset that was set before
519     * reshape_progress was updated.
520     */
521    smp_rmb();
522    if (progress == MaxSector)
523        return 0;
524    if (sh->generation == conf->generation - 1)
525        return 0;
526    /* We are in a reshape, and this is a new-generation stripe,
527     * so use new_data_offset.
528     */
529    return 1;
530}
531
532static void
533raid5_end_read_request(struct bio *bi, int error);
534static void
535raid5_end_write_request(struct bio *bi, int error);
536
537static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
538{
539    struct r5conf *conf = sh->raid_conf;
540    int i, disks = sh->disks;
541
542    might_sleep();
543
544    for (i = disks; i--; ) {
545        int rw;
546        int replace_only = 0;
547        struct bio *bi, *rbi;
548        struct md_rdev *rdev, *rrdev = NULL;
549        if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
550            if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
551                rw = WRITE_FUA;
552            else
553                rw = WRITE;
554        } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
555            rw = READ;
556        else if (test_and_clear_bit(R5_WantReplace,
557                        &sh->dev[i].flags)) {
558            rw = WRITE;
559            replace_only = 1;
560        } else
561            continue;
562        if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
563            rw |= REQ_SYNC;
564
565        bi = &sh->dev[i].req;
566        rbi = &sh->dev[i].rreq; /* For writing to replacement */
567
568        bi->bi_rw = rw;
569        rbi->bi_rw = rw;
570        if (rw & WRITE) {
571            bi->bi_end_io = raid5_end_write_request;
572            rbi->bi_end_io = raid5_end_write_request;
573        } else
574            bi->bi_end_io = raid5_end_read_request;
575
576        rcu_read_lock();
577        rrdev = rcu_dereference(conf->disks[i].replacement);
578        smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
579        rdev = rcu_dereference(conf->disks[i].rdev);
580        if (!rdev) {
581            rdev = rrdev;
582            rrdev = NULL;
583        }
584        if (rw & WRITE) {
585            if (replace_only)
586                rdev = NULL;
587            if (rdev == rrdev)
588                /* We raced and saw duplicates */
589                rrdev = NULL;
590        } else {
591            if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
592                rdev = rrdev;
593            rrdev = NULL;
594        }
595
596        if (rdev && test_bit(Faulty, &rdev->flags))
597            rdev = NULL;
598        if (rdev)
599            atomic_inc(&rdev->nr_pending);
600        if (rrdev && test_bit(Faulty, &rrdev->flags))
601            rrdev = NULL;
602        if (rrdev)
603            atomic_inc(&rrdev->nr_pending);
604        rcu_read_unlock();
605
606        /* We have already checked bad blocks for reads. Now we
607         * need to check for writes. We never accept write errors
608         * on the replacement, so we don't need to check rrdev.
609         */
610        while ((rw & WRITE) && rdev &&
611               test_bit(WriteErrorSeen, &rdev->flags)) {
612            sector_t first_bad;
613            int bad_sectors;
614            int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
615                          &first_bad, &bad_sectors);
616            if (!bad)
617                break;
618
619            if (bad < 0) {
620                set_bit(BlockedBadBlocks, &rdev->flags);
621                if (!conf->mddev->external &&
622                    conf->mddev->flags) {
623                    /* It is very unlikely, but we might
624                     * still need to write out the
625                     * bad block log - better give it
626                     * a chance */
627                    md_check_recovery(conf->mddev);
628                }
629                /*
630                 * Because md_wait_for_blocked_rdev
631                 * will dec nr_pending, we must
632                 * increment it first.
633                 */
634                atomic_inc(&rdev->nr_pending);
635                md_wait_for_blocked_rdev(rdev, conf->mddev);
636            } else {
637                /* Acknowledged bad block - skip the write */
638                rdev_dec_pending(rdev, conf->mddev);
639                rdev = NULL;
640            }
641        }
642
643        if (rdev) {
644            if (s->syncing || s->expanding || s->expanded
645                || s->replacing)
646                md_sync_acct(rdev->bdev, STRIPE_SECTORS);
647
648            set_bit(STRIPE_IO_STARTED, &sh->state);
649
650            bi->bi_bdev = rdev->bdev;
651            pr_debug("%s: for %llu schedule op %ld on disc %d\n",
652                __func__, (unsigned long long)sh->sector,
653                bi->bi_rw, i);
654            atomic_inc(&sh->count);
655            if (use_new_offset(conf, sh))
656                bi->bi_sector = (sh->sector
657                         + rdev->new_data_offset);
658            else
659                bi->bi_sector = (sh->sector
660                         + rdev->data_offset);
661            if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
662                bi->bi_rw |= REQ_FLUSH;
663
664            bi->bi_flags = 1 << BIO_UPTODATE;
665            bi->bi_idx = 0;
666            bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
667            bi->bi_io_vec[0].bv_offset = 0;
668            bi->bi_size = STRIPE_SIZE;
669            bi->bi_next = NULL;
670            if (rrdev)
671                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
672            generic_make_request(bi);
673        }
674        if (rrdev) {
675            if (s->syncing || s->expanding || s->expanded
676                || s->replacing)
677                md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
678
679            set_bit(STRIPE_IO_STARTED, &sh->state);
680
681            rbi->bi_bdev = rrdev->bdev;
682            pr_debug("%s: for %llu schedule op %ld on "
683                 "replacement disc %d\n",
684                __func__, (unsigned long long)sh->sector,
685                rbi->bi_rw, i);
686            atomic_inc(&sh->count);
687            if (use_new_offset(conf, sh))
688                rbi->bi_sector = (sh->sector
689                          + rrdev->new_data_offset);
690            else
691                rbi->bi_sector = (sh->sector
692                          + rrdev->data_offset);
693            rbi->bi_flags = 1 << BIO_UPTODATE;
694            rbi->bi_idx = 0;
695            rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
696            rbi->bi_io_vec[0].bv_offset = 0;
697            rbi->bi_size = STRIPE_SIZE;
698            rbi->bi_next = NULL;
699            generic_make_request(rbi);
700        }
701        if (!rdev && !rrdev) {
702            if (rw & WRITE)
703                set_bit(STRIPE_DEGRADED, &sh->state);
704            pr_debug("skip op %ld on disc %d for sector %llu\n",
705                bi->bi_rw, i, (unsigned long long)sh->sector);
706            clear_bit(R5_LOCKED, &sh->dev[i].flags);
707            set_bit(STRIPE_HANDLE, &sh->state);
708        }
709    }
710}
711
712static struct dma_async_tx_descriptor *
713async_copy_data(int frombio, struct bio *bio, struct page *page,
714    sector_t sector, struct dma_async_tx_descriptor *tx)
715{
716    struct bio_vec *bvl;
717    struct page *bio_page;
718    int i;
719    int page_offset;
720    struct async_submit_ctl submit;
721    enum async_tx_flags flags = 0;
722
723    if (bio->bi_sector >= sector)
724        page_offset = (signed)(bio->bi_sector - sector) * 512;
725    else
726        page_offset = (signed)(sector - bio->bi_sector) * -512;
727
728    if (frombio)
729        flags |= ASYNC_TX_FENCE;
730    init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
731
732    bio_for_each_segment(bvl, bio, i) {
733        int len = bvl->bv_len;
734        int clen;
735        int b_offset = 0;
736
737        if (page_offset < 0) {
738            b_offset = -page_offset;
739            page_offset += b_offset;
740            len -= b_offset;
741        }
742
743        if (len > 0 && page_offset + len > STRIPE_SIZE)
744            clen = STRIPE_SIZE - page_offset;
745        else
746            clen = len;
747
748        if (clen > 0) {
749            b_offset += bvl->bv_offset;
750            bio_page = bvl->bv_page;
751            if (frombio)
752                tx = async_memcpy(page, bio_page, page_offset,
753                          b_offset, clen, &submit);
754            else
755                tx = async_memcpy(bio_page, page, b_offset,
756                          page_offset, clen, &submit);
757        }
758        /* chain the operations */
759        submit.depend_tx = tx;
760
761        if (clen < len) /* hit end of page */
762            break;
763        page_offset += len;
764    }
765
766    return tx;
767}
768
769static void ops_complete_biofill(void *stripe_head_ref)
770{
771    struct stripe_head *sh = stripe_head_ref;
772    struct bio *return_bi = NULL;
773    int i;
774
775    pr_debug("%s: stripe %llu\n", __func__,
776        (unsigned long long)sh->sector);
777
778    /* clear completed biofills */
779    for (i = sh->disks; i--; ) {
780        struct r5dev *dev = &sh->dev[i];
781
782        /* acknowledge completion of a biofill operation */
783        /* and check if we need to reply to a read request,
784         * new R5_Wantfill requests are held off until
785         * !STRIPE_BIOFILL_RUN
786         */
787        if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
788            struct bio *rbi, *rbi2;
789
790            BUG_ON(!dev->read);
791            rbi = dev->read;
792            dev->read = NULL;
793            while (rbi && rbi->bi_sector <
794                dev->sector + STRIPE_SECTORS) {
795                rbi2 = r5_next_bio(rbi, dev->sector);
796                if (!raid5_dec_bi_active_stripes(rbi)) {
797                    rbi->bi_next = return_bi;
798                    return_bi = rbi;
799                }
800                rbi = rbi2;
801            }
802        }
803    }
804    clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
805
806    return_io(return_bi);
807
808    set_bit(STRIPE_HANDLE, &sh->state);
809    release_stripe(sh);
810}
811
812static void ops_run_biofill(struct stripe_head *sh)
813{
814    struct dma_async_tx_descriptor *tx = NULL;
815    struct async_submit_ctl submit;
816    int i;
817
818    pr_debug("%s: stripe %llu\n", __func__,
819        (unsigned long long)sh->sector);
820
821    for (i = sh->disks; i--; ) {
822        struct r5dev *dev = &sh->dev[i];
823        if (test_bit(R5_Wantfill, &dev->flags)) {
824            struct bio *rbi;
825            spin_lock_irq(&sh->stripe_lock);
826            dev->read = rbi = dev->toread;
827            dev->toread = NULL;
828            spin_unlock_irq(&sh->stripe_lock);
829            while (rbi && rbi->bi_sector <
830                dev->sector + STRIPE_SECTORS) {
831                tx = async_copy_data(0, rbi, dev->page,
832                    dev->sector, tx);
833                rbi = r5_next_bio(rbi, dev->sector);
834            }
835        }
836    }
837
838    atomic_inc(&sh->count);
839    init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
840    async_trigger_callback(&submit);
841}
842
843static void mark_target_uptodate(struct stripe_head *sh, int target)
844{
845    struct r5dev *tgt;
846
847    if (target < 0)
848        return;
849
850    tgt = &sh->dev[target];
851    set_bit(R5_UPTODATE, &tgt->flags);
852    BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
853    clear_bit(R5_Wantcompute, &tgt->flags);
854}
855
856static void ops_complete_compute(void *stripe_head_ref)
857{
858    struct stripe_head *sh = stripe_head_ref;
859
860    pr_debug("%s: stripe %llu\n", __func__,
861        (unsigned long long)sh->sector);
862
863    /* mark the computed target(s) as uptodate */
864    mark_target_uptodate(sh, sh->ops.target);
865    mark_target_uptodate(sh, sh->ops.target2);
866
867    clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
868    if (sh->check_state == check_state_compute_run)
869        sh->check_state = check_state_compute_result;
870    set_bit(STRIPE_HANDLE, &sh->state);
871    release_stripe(sh);
872}
873
874/* return a pointer to the address conversion region of the scribble buffer */
875static addr_conv_t *to_addr_conv(struct stripe_head *sh,
876                 struct raid5_percpu *percpu)
877{
878    return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
879}
880
881static struct dma_async_tx_descriptor *
882ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
883{
884    int disks = sh->disks;
885    struct page **xor_srcs = percpu->scribble;
886    int target = sh->ops.target;
887    struct r5dev *tgt = &sh->dev[target];
888    struct page *xor_dest = tgt->page;
889    int count = 0;
890    struct dma_async_tx_descriptor *tx;
891    struct async_submit_ctl submit;
892    int i;
893
894    pr_debug("%s: stripe %llu block: %d\n",
895        __func__, (unsigned long long)sh->sector, target);
896    BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
897
898    for (i = disks; i--; )
899        if (i != target)
900            xor_srcs[count++] = sh->dev[i].page;
901
902    atomic_inc(&sh->count);
903
904    init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
905              ops_complete_compute, sh, to_addr_conv(sh, percpu));
906    if (unlikely(count == 1))
907        tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
908    else
909        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
910
911    return tx;
912}
913
914/* set_syndrome_sources - populate source buffers for gen_syndrome
915 * @srcs - (struct page *) array of size sh->disks
916 * @sh - stripe_head to parse
917 *
918 * Populates srcs in proper layout order for the stripe and returns the
919 * 'count' of sources to be used in a call to async_gen_syndrome. The P
920 * destination buffer is recorded in srcs[count] and the Q destination
921 * is recorded in srcs[count+1].
922 */
923static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
924{
925    int disks = sh->disks;
926    int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
927    int d0_idx = raid6_d0(sh);
928    int count;
929    int i;
930
931    for (i = 0; i < disks; i++)
932        srcs[i] = NULL;
933
934    count = 0;
935    i = d0_idx;
936    do {
937        int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
938
939        srcs[slot] = sh->dev[i].page;
940        i = raid6_next_disk(i, disks);
941    } while (i != d0_idx);
942
943    return syndrome_disks;
944}
945
946static struct dma_async_tx_descriptor *
947ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
948{
949    int disks = sh->disks;
950    struct page **blocks = percpu->scribble;
951    int target;
952    int qd_idx = sh->qd_idx;
953    struct dma_async_tx_descriptor *tx;
954    struct async_submit_ctl submit;
955    struct r5dev *tgt;
956    struct page *dest;
957    int i;
958    int count;
959
960    if (sh->ops.target < 0)
961        target = sh->ops.target2;
962    else if (sh->ops.target2 < 0)
963        target = sh->ops.target;
964    else
965        /* we should only have one valid target */
966        BUG();
967    BUG_ON(target < 0);
968    pr_debug("%s: stripe %llu block: %d\n",
969        __func__, (unsigned long long)sh->sector, target);
970
971    tgt = &sh->dev[target];
972    BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
973    dest = tgt->page;
974
975    atomic_inc(&sh->count);
976
977    if (target == qd_idx) {
978        count = set_syndrome_sources(blocks, sh);
979        blocks[count] = NULL; /* regenerating p is not necessary */
980        BUG_ON(blocks[count+1] != dest); /* q should already be set */
981        init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
982                  ops_complete_compute, sh,
983                  to_addr_conv(sh, percpu));
984        tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
985    } else {
986        /* Compute any data- or p-drive using XOR */
987        count = 0;
988        for (i = disks; i-- ; ) {
989            if (i == target || i == qd_idx)
990                continue;
991            blocks[count++] = sh->dev[i].page;
992        }
993
994        init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
995                  NULL, ops_complete_compute, sh,
996                  to_addr_conv(sh, percpu));
997        tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
998    }
999
1000    return tx;
1001}
1002
1003static struct dma_async_tx_descriptor *
1004ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1005{
1006    int i, count, disks = sh->disks;
1007    int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1008    int d0_idx = raid6_d0(sh);
1009    int faila = -1, failb = -1;
1010    int target = sh->ops.target;
1011    int target2 = sh->ops.target2;
1012    struct r5dev *tgt = &sh->dev[target];
1013    struct r5dev *tgt2 = &sh->dev[target2];
1014    struct dma_async_tx_descriptor *tx;
1015    struct page **blocks = percpu->scribble;
1016    struct async_submit_ctl submit;
1017
1018    pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1019         __func__, (unsigned long long)sh->sector, target, target2);
1020    BUG_ON(target < 0 || target2 < 0);
1021    BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1022    BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1023
1024    /* we need to open-code set_syndrome_sources to handle the
1025     * slot number conversion for 'faila' and 'failb'
1026     */
1027    for (i = 0; i < disks ; i++)
1028        blocks[i] = NULL;
1029    count = 0;
1030    i = d0_idx;
1031    do {
1032        int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1033
1034        blocks[slot] = sh->dev[i].page;
1035
1036        if (i == target)
1037            faila = slot;
1038        if (i == target2)
1039            failb = slot;
1040        i = raid6_next_disk(i, disks);
1041    } while (i != d0_idx);
1042
1043    BUG_ON(faila == failb);
1044    if (failb < faila)
1045        swap(faila, failb);
1046    pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1047         __func__, (unsigned long long)sh->sector, faila, failb);
1048
1049    atomic_inc(&sh->count);
1050
1051    if (failb == syndrome_disks+1) {
1052        /* Q disk is one of the missing disks */
1053        if (faila == syndrome_disks) {
1054            /* Missing P+Q, just recompute */
1055            init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1056                      ops_complete_compute, sh,
1057                      to_addr_conv(sh, percpu));
1058            return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1059                          STRIPE_SIZE, &submit);
1060        } else {
1061            struct page *dest;
1062            int data_target;
1063            int qd_idx = sh->qd_idx;
1064
1065            /* Missing D+Q: recompute D from P, then recompute Q */
1066            if (target == qd_idx)
1067                data_target = target2;
1068            else
1069                data_target = target;
1070
1071            count = 0;
1072            for (i = disks; i-- ; ) {
1073                if (i == data_target || i == qd_idx)
1074                    continue;
1075                blocks[count++] = sh->dev[i].page;
1076            }
1077            dest = sh->dev[data_target].page;
1078            init_async_submit(&submit,
1079                      ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1080                      NULL, NULL, NULL,
1081                      to_addr_conv(sh, percpu));
1082            tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1083                       &submit);
1084
1085            count = set_syndrome_sources(blocks, sh);
1086            init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1087                      ops_complete_compute, sh,
1088                      to_addr_conv(sh, percpu));
1089            return async_gen_syndrome(blocks, 0, count+2,
1090                          STRIPE_SIZE, &submit);
1091        }
1092    } else {
1093        init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1094                  ops_complete_compute, sh,
1095                  to_addr_conv(sh, percpu));
1096        if (failb == syndrome_disks) {
1097            /* We're missing D+P. */
1098            return async_raid6_datap_recov(syndrome_disks+2,
1099                               STRIPE_SIZE, faila,
1100                               blocks, &submit);
1101        } else {
1102            /* We're missing D+D. */
1103            return async_raid6_2data_recov(syndrome_disks+2,
1104                               STRIPE_SIZE, faila, failb,
1105                               blocks, &submit);
1106        }
1107    }
1108}
1109
1110
1111static void ops_complete_prexor(void *stripe_head_ref)
1112{
1113    struct stripe_head *sh = stripe_head_ref;
1114
1115    pr_debug("%s: stripe %llu\n", __func__,
1116        (unsigned long long)sh->sector);
1117}
1118
1119static struct dma_async_tx_descriptor *
1120ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1121           struct dma_async_tx_descriptor *tx)
1122{
1123    int disks = sh->disks;
1124    struct page **xor_srcs = percpu->scribble;
1125    int count = 0, pd_idx = sh->pd_idx, i;
1126    struct async_submit_ctl submit;
1127
1128    /* existing parity data subtracted */
1129    struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1130
1131    pr_debug("%s: stripe %llu\n", __func__,
1132        (unsigned long long)sh->sector);
1133
1134    for (i = disks; i--; ) {
1135        struct r5dev *dev = &sh->dev[i];
1136        /* Only process blocks that are known to be uptodate */
1137        if (test_bit(R5_Wantdrain, &dev->flags))
1138            xor_srcs[count++] = dev->page;
1139    }
1140
1141    init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1142              ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1143    tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1144
1145    return tx;
1146}
1147
1148static struct dma_async_tx_descriptor *
1149ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1150{
1151    int disks = sh->disks;
1152    int i;
1153
1154    pr_debug("%s: stripe %llu\n", __func__,
1155        (unsigned long long)sh->sector);
1156
1157    for (i = disks; i--; ) {
1158        struct r5dev *dev = &sh->dev[i];
1159        struct bio *chosen;
1160
1161        if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1162            struct bio *wbi;
1163
1164            spin_lock_irq(&sh->stripe_lock);
1165            chosen = dev->towrite;
1166            dev->towrite = NULL;
1167            BUG_ON(dev->written);
1168            wbi = dev->written = chosen;
1169            spin_unlock_irq(&sh->stripe_lock);
1170
1171            while (wbi && wbi->bi_sector <
1172                dev->sector + STRIPE_SECTORS) {
1173                if (wbi->bi_rw & REQ_FUA)
1174                    set_bit(R5_WantFUA, &dev->flags);
1175                if (wbi->bi_rw & REQ_SYNC)
1176                    set_bit(R5_SyncIO, &dev->flags);
1177                tx = async_copy_data(1, wbi, dev->page,
1178                    dev->sector, tx);
1179                wbi = r5_next_bio(wbi, dev->sector);
1180            }
1181        }
1182    }
1183
1184    return tx;
1185}
1186
1187static void ops_complete_reconstruct(void *stripe_head_ref)
1188{
1189    struct stripe_head *sh = stripe_head_ref;
1190    int disks = sh->disks;
1191    int pd_idx = sh->pd_idx;
1192    int qd_idx = sh->qd_idx;
1193    int i;
1194    bool fua = false, sync = false;
1195
1196    pr_debug("%s: stripe %llu\n", __func__,
1197        (unsigned long long)sh->sector);
1198
1199    for (i = disks; i--; ) {
1200        fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1201        sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1202    }
1203
1204    for (i = disks; i--; ) {
1205        struct r5dev *dev = &sh->dev[i];
1206
1207        if (dev->written || i == pd_idx || i == qd_idx) {
1208            set_bit(R5_UPTODATE, &dev->flags);
1209            if (fua)
1210                set_bit(R5_WantFUA, &dev->flags);
1211            if (sync)
1212                set_bit(R5_SyncIO, &dev->flags);
1213        }
1214    }
1215
1216    if (sh->reconstruct_state == reconstruct_state_drain_run)
1217        sh->reconstruct_state = reconstruct_state_drain_result;
1218    else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1219        sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1220    else {
1221        BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1222        sh->reconstruct_state = reconstruct_state_result;
1223    }
1224
1225    set_bit(STRIPE_HANDLE, &sh->state);
1226    release_stripe(sh);
1227}
1228
1229static void
1230ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1231             struct dma_async_tx_descriptor *tx)
1232{
1233    int disks = sh->disks;
1234    struct page **xor_srcs = percpu->scribble;
1235    struct async_submit_ctl submit;
1236    int count = 0, pd_idx = sh->pd_idx, i;
1237    struct page *xor_dest;
1238    int prexor = 0;
1239    unsigned long flags;
1240
1241    pr_debug("%s: stripe %llu\n", __func__,
1242        (unsigned long long)sh->sector);
1243
1244    /* check if prexor is active which means only process blocks
1245     * that are part of a read-modify-write (written)
1246     */
1247    if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1248        prexor = 1;
1249        xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1250        for (i = disks; i--; ) {
1251            struct r5dev *dev = &sh->dev[i];
1252            if (dev->written)
1253                xor_srcs[count++] = dev->page;
1254        }
1255    } else {
1256        xor_dest = sh->dev[pd_idx].page;
1257        for (i = disks; i--; ) {
1258            struct r5dev *dev = &sh->dev[i];
1259            if (i != pd_idx)
1260                xor_srcs[count++] = dev->page;
1261        }
1262    }
1263
1264    /* 1/ if we prexor'd then the dest is reused as a source
1265     * 2/ if we did not prexor then we are redoing the parity
1266     * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1267     * for the synchronous xor case
1268     */
1269    flags = ASYNC_TX_ACK |
1270        (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1271
1272    atomic_inc(&sh->count);
1273
1274    init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1275              to_addr_conv(sh, percpu));
1276    if (unlikely(count == 1))
1277        tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1278    else
1279        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1280}
1281
1282static void
1283ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1284             struct dma_async_tx_descriptor *tx)
1285{
1286    struct async_submit_ctl submit;
1287    struct page **blocks = percpu->scribble;
1288    int count;
1289
1290    pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1291
1292    count = set_syndrome_sources(blocks, sh);
1293
1294    atomic_inc(&sh->count);
1295
1296    init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1297              sh, to_addr_conv(sh, percpu));
1298    async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1299}
1300
1301static void ops_complete_check(void *stripe_head_ref)
1302{
1303    struct stripe_head *sh = stripe_head_ref;
1304
1305    pr_debug("%s: stripe %llu\n", __func__,
1306        (unsigned long long)sh->sector);
1307
1308    sh->check_state = check_state_check_result;
1309    set_bit(STRIPE_HANDLE, &sh->state);
1310    release_stripe(sh);
1311}
1312
1313static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1314{
1315    int disks = sh->disks;
1316    int pd_idx = sh->pd_idx;
1317    int qd_idx = sh->qd_idx;
1318    struct page *xor_dest;
1319    struct page **xor_srcs = percpu->scribble;
1320    struct dma_async_tx_descriptor *tx;
1321    struct async_submit_ctl submit;
1322    int count;
1323    int i;
1324
1325    pr_debug("%s: stripe %llu\n", __func__,
1326        (unsigned long long)sh->sector);
1327
1328    count = 0;
1329    xor_dest = sh->dev[pd_idx].page;
1330    xor_srcs[count++] = xor_dest;
1331    for (i = disks; i--; ) {
1332        if (i == pd_idx || i == qd_idx)
1333            continue;
1334        xor_srcs[count++] = sh->dev[i].page;
1335    }
1336
1337    init_async_submit(&submit, 0, NULL, NULL, NULL,
1338              to_addr_conv(sh, percpu));
1339    tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1340               &sh->ops.zero_sum_result, &submit);
1341
1342    atomic_inc(&sh->count);
1343    init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1344    tx = async_trigger_callback(&submit);
1345}
1346
1347static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1348{
1349    struct page **srcs = percpu->scribble;
1350    struct async_submit_ctl submit;
1351    int count;
1352
1353    pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1354        (unsigned long long)sh->sector, checkp);
1355
1356    count = set_syndrome_sources(srcs, sh);
1357    if (!checkp)
1358        srcs[count] = NULL;
1359
1360    atomic_inc(&sh->count);
1361    init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1362              sh, to_addr_conv(sh, percpu));
1363    async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1364               &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1365}
1366
1367static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1368{
1369    int overlap_clear = 0, i, disks = sh->disks;
1370    struct dma_async_tx_descriptor *tx = NULL;
1371    struct r5conf *conf = sh->raid_conf;
1372    int level = conf->level;
1373    struct raid5_percpu *percpu;
1374    unsigned long cpu;
1375
1376    cpu = get_cpu();
1377    percpu = per_cpu_ptr(conf->percpu, cpu);
1378    if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1379        ops_run_biofill(sh);
1380        overlap_clear++;
1381    }
1382
1383    if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1384        if (level < 6)
1385            tx = ops_run_compute5(sh, percpu);
1386        else {
1387            if (sh->ops.target2 < 0 || sh->ops.target < 0)
1388                tx = ops_run_compute6_1(sh, percpu);
1389            else
1390                tx = ops_run_compute6_2(sh, percpu);
1391        }
1392        /* terminate the chain if reconstruct is not set to be run */
1393        if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1394            async_tx_ack(tx);
1395    }
1396
1397    if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1398        tx = ops_run_prexor(sh, percpu, tx);
1399
1400    if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1401        tx = ops_run_biodrain(sh, tx);
1402        overlap_clear++;
1403    }
1404
1405    if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1406        if (level < 6)
1407            ops_run_reconstruct5(sh, percpu, tx);
1408        else
1409            ops_run_reconstruct6(sh, percpu, tx);
1410    }
1411
1412    if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1413        if (sh->check_state == check_state_run)
1414            ops_run_check_p(sh, percpu);
1415        else if (sh->check_state == check_state_run_q)
1416            ops_run_check_pq(sh, percpu, 0);
1417        else if (sh->check_state == check_state_run_pq)
1418            ops_run_check_pq(sh, percpu, 1);
1419        else
1420            BUG();
1421    }
1422
1423    if (overlap_clear)
1424        for (i = disks; i--; ) {
1425            struct r5dev *dev = &sh->dev[i];
1426            if (test_and_clear_bit(R5_Overlap, &dev->flags))
1427                wake_up(&sh->raid_conf->wait_for_overlap);
1428        }
1429    put_cpu();
1430}
1431
1432#ifdef CONFIG_MULTICORE_RAID456
1433static void async_run_ops(void *param, async_cookie_t cookie)
1434{
1435    struct stripe_head *sh = param;
1436    unsigned long ops_request = sh->ops.request;
1437
1438    clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1439    wake_up(&sh->ops.wait_for_ops);
1440
1441    __raid_run_ops(sh, ops_request);
1442    release_stripe(sh);
1443}
1444
1445static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1446{
1447    /* since handle_stripe can be called outside of raid5d context
1448     * we need to ensure sh->ops.request is de-staged before another
1449     * request arrives
1450     */
1451    wait_event(sh->ops.wait_for_ops,
1452           !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1453    sh->ops.request = ops_request;
1454
1455    atomic_inc(&sh->count);
1456    async_schedule(async_run_ops, sh);
1457}
1458#else
1459#define raid_run_ops __raid_run_ops
1460#endif
1461
1462static int grow_one_stripe(struct r5conf *conf)
1463{
1464    struct stripe_head *sh;
1465    sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1466    if (!sh)
1467        return 0;
1468
1469    sh->raid_conf = conf;
1470    #ifdef CONFIG_MULTICORE_RAID456
1471    init_waitqueue_head(&sh->ops.wait_for_ops);
1472    #endif
1473
1474    spin_lock_init(&sh->stripe_lock);
1475
1476    if (grow_buffers(sh)) {
1477        shrink_buffers(sh);
1478        kmem_cache_free(conf->slab_cache, sh);
1479        return 0;
1480    }
1481    /* we just created an active stripe so... */
1482    atomic_set(&sh->count, 1);
1483    atomic_inc(&conf->active_stripes);
1484    INIT_LIST_HEAD(&sh->lru);
1485    release_stripe(sh);
1486    return 1;
1487}
1488
1489static int grow_stripes(struct r5conf *conf, int num)
1490{
1491    struct kmem_cache *sc;
1492    int devs = max(conf->raid_disks, conf->previous_raid_disks);
1493
1494    if (conf->mddev->gendisk)
1495        sprintf(conf->cache_name[0],
1496            "raid%d-%s", conf->level, mdname(conf->mddev));
1497    else
1498        sprintf(conf->cache_name[0],
1499            "raid%d-%p", conf->level, conf->mddev);
1500    sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1501
1502    conf->active_name = 0;
1503    sc = kmem_cache_create(conf->cache_name[conf->active_name],
1504                   sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1505                   0, 0, NULL);
1506    if (!sc)
1507        return 1;
1508    conf->slab_cache = sc;
1509    conf->pool_size = devs;
1510    while (num--)
1511        if (!grow_one_stripe(conf))
1512            return 1;
1513    return 0;
1514}
1515
1516/**
1517 * scribble_len - return the required size of the scribble region
1518 * @num - total number of disks in the array
1519 *
1520 * The size must be enough to contain:
1521 * 1/ a struct page pointer for each device in the array +2
1522 * 2/ room to convert each entry in (1) to its corresponding dma
1523 * (dma_map_page()) or page (page_address()) address.
1524 *
1525 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1526 * calculate over all devices (not just the data blocks), using zeros in place
1527 * of the P and Q blocks.
1528 */
1529static size_t scribble_len(int num)
1530{
1531    size_t len;
1532
1533    len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1534
1535    return len;
1536}
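/*
 * Illustration only, not part of the driver: how the scribble region sized
 * above is carved up, matching to_addr_conv().  The first num+2 slots hold
 * struct page pointers (data sources plus the P and Q destinations), and
 * the addr_conv_t area consumed by the async_tx API starts right after
 * them.  The variable names are hypothetical.
 */
#if 0
static void scribble_layout_example(struct raid5_percpu *percpu, int num)
{
    struct page **srcs = percpu->scribble;
    addr_conv_t *addr_conv = percpu->scribble +
                 sizeof(struct page *) * (num + 2);

    (void)srcs;
    (void)addr_conv;
}
#endif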
1537
1538static int resize_stripes(struct r5conf *conf, int newsize)
1539{
1540    /* Make all the stripes able to hold 'newsize' devices.
1541     * New slots in each stripe get 'page' set to a new page.
1542     *
1543     * This happens in stages:
1544     * 1/ create a new kmem_cache and allocate the required number of
1545     * stripe_heads.
1546     * 2/ gather all the old stripe_heads and transfer the pages across
1547     * to the new stripe_heads. This will have the side effect of
1548     * freezing the array as once all stripe_heads have been collected,
1549     * no IO will be possible. Old stripe heads are freed once their
1550     * pages have been transferred over, and the old kmem_cache is
1551     * freed when all stripes are done.
1552     * 3/ reallocate conf->disks to be suitably bigger. If this fails,
1553     * we simply return a failure status - no need to clean anything up.
1554     * 4/ allocate new pages for the new slots in the new stripe_heads.
1555     * If this fails, we don't bother trying to shrink the
1556     * stripe_heads down again, we just leave them as they are.
1557     * As each stripe_head is processed the new one is released into
1558     * active service.
1559     *
1560     * Once step 2 is started, we cannot afford to wait for a write,
1561     * so we use GFP_NOIO allocations.
1562     */
1563    struct stripe_head *osh, *nsh;
1564    LIST_HEAD(newstripes);
1565    struct disk_info *ndisks;
1566    unsigned long cpu;
1567    int err;
1568    struct kmem_cache *sc;
1569    int i;
1570
1571    if (newsize <= conf->pool_size)
1572        return 0; /* never bother to shrink */
1573
1574    err = md_allow_write(conf->mddev);
1575    if (err)
1576        return err;
1577
1578    /* Step 1 */
1579    sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1580                   sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1581                   0, 0, NULL);
1582    if (!sc)
1583        return -ENOMEM;
1584
1585    for (i = conf->max_nr_stripes; i; i--) {
1586        nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1587        if (!nsh)
1588            break;
1589
1590        nsh->raid_conf = conf;
1591        #ifdef CONFIG_MULTICORE_RAID456
1592        init_waitqueue_head(&nsh->ops.wait_for_ops);
1593        #endif
1594        spin_lock_init(&nsh->stripe_lock);
1595
1596        list_add(&nsh->lru, &newstripes);
1597    }
1598    if (i) {
1599        /* didn't get enough, give up */
1600        while (!list_empty(&newstripes)) {
1601            nsh = list_entry(newstripes.next, struct stripe_head, lru);
1602            list_del(&nsh->lru);
1603            kmem_cache_free(sc, nsh);
1604        }
1605        kmem_cache_destroy(sc);
1606        return -ENOMEM;
1607    }
1608    /* Step 2 - Must use GFP_NOIO now.
1609     * OK, we have enough stripes, start collecting inactive
1610     * stripes and copying them over
1611     */
1612    list_for_each_entry(nsh, &newstripes, lru) {
1613        spin_lock_irq(&conf->device_lock);
1614        wait_event_lock_irq(conf->wait_for_stripe,
1615                    !list_empty(&conf->inactive_list),
1616                    conf->device_lock,
1617                    );
1618        osh = get_free_stripe(conf);
1619        spin_unlock_irq(&conf->device_lock);
1620        atomic_set(&nsh->count, 1);
1621        for(i=0; i<conf->pool_size; i++)
1622            nsh->dev[i].page = osh->dev[i].page;
1623        for( ; i<newsize; i++)
1624            nsh->dev[i].page = NULL;
1625        kmem_cache_free(conf->slab_cache, osh);
1626    }
1627    kmem_cache_destroy(conf->slab_cache);
1628
1629    /* Step 3.
1630     * At this point, we are holding all the stripes so the array
1631     * is completely stalled, so now is a good time to resize
1632     * conf->disks and the scribble region
1633     */
1634    ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1635    if (ndisks) {
1636        for (i=0; i<conf->raid_disks; i++)
1637            ndisks[i] = conf->disks[i];
1638        kfree(conf->disks);
1639        conf->disks = ndisks;
1640    } else
1641        err = -ENOMEM;
1642
1643    get_online_cpus();
1644    conf->scribble_len = scribble_len(newsize);
1645    for_each_present_cpu(cpu) {
1646        struct raid5_percpu *percpu;
1647        void *scribble;
1648
1649        percpu = per_cpu_ptr(conf->percpu, cpu);
1650        scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1651
1652        if (scribble) {
1653            kfree(percpu->scribble);
1654            percpu->scribble = scribble;
1655        } else {
1656            err = -ENOMEM;
1657            break;
1658        }
1659    }
1660    put_online_cpus();
1661
1662    /* Step 4, return new stripes to service */
1663    while(!list_empty(&newstripes)) {
1664        nsh = list_entry(newstripes.next, struct stripe_head, lru);
1665        list_del_init(&nsh->lru);
1666
1667        for (i=conf->raid_disks; i < newsize; i++)
1668            if (nsh->dev[i].page == NULL) {
1669                struct page *p = alloc_page(GFP_NOIO);
1670                nsh->dev[i].page = p;
1671                if (!p)
1672                    err = -ENOMEM;
1673            }
1674        release_stripe(nsh);
1675    }
1676    /* critical section passed, GFP_NOIO no longer needed */
1677
1678    conf->slab_cache = sc;
1679    conf->active_name = 1-conf->active_name;
1680    conf->pool_size = newsize;
1681    return err;
1682}
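/*
 * The size passed to kmem_cache_create() above,
 * sizeof(struct stripe_head) + (newsize - 1) * sizeof(struct r5dev),
 * relies on struct stripe_head being declared with a one-element dev[]
 * array at its tail, so each object is allocated with room for the extra
 * per-device slots. A minimal, self-contained sketch of that idiom
 * follows; the names are made up for illustration and the block is
 * compiled out.
 */
#if 0
#include <stdlib.h>

struct slot {
	int value;
};

struct head {
	int nslots;
	struct slot slot[1];		/* more slots follow in memory */
};

static struct head *head_alloc(int nslots)
{
	/* slot[0] is already counted in sizeof(struct head) */
	struct head *h = calloc(1, sizeof(*h) +
				   (nslots - 1) * sizeof(struct slot));

	if (h)
		h->nslots = nslots;
	return h;
}
#endif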
1683
1684static int drop_one_stripe(struct r5conf *conf)
1685{
1686    struct stripe_head *sh;
1687
1688    spin_lock_irq(&conf->device_lock);
1689    sh = get_free_stripe(conf);
1690    spin_unlock_irq(&conf->device_lock);
1691    if (!sh)
1692        return 0;
1693    BUG_ON(atomic_read(&sh->count));
1694    shrink_buffers(sh);
1695    kmem_cache_free(conf->slab_cache, sh);
1696    atomic_dec(&conf->active_stripes);
1697    return 1;
1698}
1699
1700static void shrink_stripes(struct r5conf *conf)
1701{
1702    while (drop_one_stripe(conf))
1703        ;
1704
1705    if (conf->slab_cache)
1706        kmem_cache_destroy(conf->slab_cache);
1707    conf->slab_cache = NULL;
1708}
1709
1710static void raid5_end_read_request(struct bio * bi, int error)
1711{
1712    struct stripe_head *sh = bi->bi_private;
1713    struct r5conf *conf = sh->raid_conf;
1714    int disks = sh->disks, i;
1715    int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1716    char b[BDEVNAME_SIZE];
1717    struct md_rdev *rdev = NULL;
1718    sector_t s;
1719
1720    for (i=0 ; i<disks; i++)
1721        if (bi == &sh->dev[i].req)
1722            break;
1723
1724    pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1725        (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1726        uptodate);
1727    if (i == disks) {
1728        BUG();
1729        return;
1730    }
1731    if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1732        /* If replacement finished while this request was outstanding,
1733         * 'replacement' might be NULL already.
1734         * In that case it moved down to 'rdev'.
1735         * rdev is not removed until all requests are finished.
1736         */
1737        rdev = conf->disks[i].replacement;
1738    if (!rdev)
1739        rdev = conf->disks[i].rdev;
1740
1741    if (use_new_offset(conf, sh))
1742        s = sh->sector + rdev->new_data_offset;
1743    else
1744        s = sh->sector + rdev->data_offset;
1745    if (uptodate) {
1746        set_bit(R5_UPTODATE, &sh->dev[i].flags);
1747        if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1748            /* Note that this cannot happen on a
1749             * replacement device. We just fail those on
1750             * any error
1751             */
1752            printk_ratelimited(
1753                KERN_INFO
1754                "md/raid:%s: read error corrected"
1755                " (%lu sectors at %llu on %s)\n",
1756                mdname(conf->mddev), STRIPE_SECTORS,
1757                (unsigned long long)s,
1758                bdevname(rdev->bdev, b));
1759            atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1760            clear_bit(R5_ReadError, &sh->dev[i].flags);
1761            clear_bit(R5_ReWrite, &sh->dev[i].flags);
1762        } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
1763            clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1764
1765        if (atomic_read(&rdev->read_errors))
1766            atomic_set(&rdev->read_errors, 0);
1767    } else {
1768        const char *bdn = bdevname(rdev->bdev, b);
1769        int retry = 0;
1770        int set_bad = 0;
1771
1772        clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1773        atomic_inc(&rdev->read_errors);
1774        if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
1775            printk_ratelimited(
1776                KERN_WARNING
1777                "md/raid:%s: read error on replacement device "
1778                "(sector %llu on %s).\n",
1779                mdname(conf->mddev),
1780                (unsigned long long)s,
1781                bdn);
1782        else if (conf->mddev->degraded >= conf->max_degraded) {
1783            set_bad = 1;
1784            printk_ratelimited(
1785                KERN_WARNING
1786                "md/raid:%s: read error not correctable "
1787                "(sector %llu on %s).\n",
1788                mdname(conf->mddev),
1789                (unsigned long long)s,
1790                bdn);
1791        } else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
1792            /* Oh, no!!! */
1793            set_bad = 1;
1794            printk_ratelimited(
1795                KERN_WARNING
1796                "md/raid:%s: read error NOT corrected!! "
1797                "(sector %llu on %s).\n",
1798                mdname(conf->mddev),
1799                (unsigned long long)s,
1800                bdn);
1801        } else if (atomic_read(&rdev->read_errors)
1802             > conf->max_nr_stripes)
1803            printk(KERN_WARNING
1804                   "md/raid:%s: Too many read errors, failing device %s.\n",
1805                   mdname(conf->mddev), bdn);
1806        else
1807            retry = 1;
1808        if (retry)
1809            if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1810                set_bit(R5_ReadError, &sh->dev[i].flags);
1811                clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1812            } else
1813                set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1814        else {
1815            clear_bit(R5_ReadError, &sh->dev[i].flags);
1816            clear_bit(R5_ReWrite, &sh->dev[i].flags);
1817            if (!(set_bad
1818                  && test_bit(In_sync, &rdev->flags)
1819                  && rdev_set_badblocks(
1820                      rdev, sh->sector, STRIPE_SECTORS, 0)))
1821                md_error(conf->mddev, rdev);
1822        }
1823    }
1824    rdev_dec_pending(rdev, conf->mddev);
1825    clear_bit(R5_LOCKED, &sh->dev[i].flags);
1826    set_bit(STRIPE_HANDLE, &sh->state);
1827    release_stripe(sh);
1828}
1829
1830static void raid5_end_write_request(struct bio *bi, int error)
1831{
1832    struct stripe_head *sh = bi->bi_private;
1833    struct r5conf *conf = sh->raid_conf;
1834    int disks = sh->disks, i;
1835    struct md_rdev *uninitialized_var(rdev);
1836    int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1837    sector_t first_bad;
1838    int bad_sectors;
1839    int replacement = 0;
1840
1841    for (i = 0 ; i < disks; i++) {
1842        if (bi == &sh->dev[i].req) {
1843            rdev = conf->disks[i].rdev;
1844            break;
1845        }
1846        if (bi == &sh->dev[i].rreq) {
1847            rdev = conf->disks[i].replacement;
1848            if (rdev)
1849                replacement = 1;
1850            else
1851                /* rdev was removed and 'replacement'
1852                 * replaced it. rdev is not removed
1853                 * until all requests are finished.
1854                 */
1855                rdev = conf->disks[i].rdev;
1856            break;
1857        }
1858    }
1859    pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1860        (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1861        uptodate);
1862    if (i == disks) {
1863        BUG();
1864        return;
1865    }
1866
1867    if (replacement) {
1868        if (!uptodate)
1869            md_error(conf->mddev, rdev);
1870        else if (is_badblock(rdev, sh->sector,
1871                     STRIPE_SECTORS,
1872                     &first_bad, &bad_sectors))
1873            set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
1874    } else {
1875        if (!uptodate) {
1876            set_bit(WriteErrorSeen, &rdev->flags);
1877            set_bit(R5_WriteError, &sh->dev[i].flags);
1878            if (!test_and_set_bit(WantReplacement, &rdev->flags))
1879                set_bit(MD_RECOVERY_NEEDED,
1880                    &rdev->mddev->recovery);
1881        } else if (is_badblock(rdev, sh->sector,
1882                       STRIPE_SECTORS,
1883                       &first_bad, &bad_sectors))
1884            set_bit(R5_MadeGood, &sh->dev[i].flags);
1885    }
1886    rdev_dec_pending(rdev, conf->mddev);
1887
1888    if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
1889        clear_bit(R5_LOCKED, &sh->dev[i].flags);
1890    set_bit(STRIPE_HANDLE, &sh->state);
1891    release_stripe(sh);
1892}
1893
1894static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1895
1896static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1897{
1898    struct r5dev *dev = &sh->dev[i];
1899
1900    bio_init(&dev->req);
1901    dev->req.bi_io_vec = &dev->vec;
1902    dev->req.bi_vcnt++;
1903    dev->req.bi_max_vecs++;
1904    dev->req.bi_private = sh;
1905    dev->vec.bv_page = dev->page;
1906
1907    bio_init(&dev->rreq);
1908    dev->rreq.bi_io_vec = &dev->rvec;
1909    dev->rreq.bi_vcnt++;
1910    dev->rreq.bi_max_vecs++;
1911    dev->rreq.bi_private = sh;
1912    dev->rvec.bv_page = dev->page;
1913
1914    dev->flags = 0;
1915    dev->sector = compute_blocknr(sh, i, previous);
1916}
1917
1918static void error(struct mddev *mddev, struct md_rdev *rdev)
1919{
1920    char b[BDEVNAME_SIZE];
1921    struct r5conf *conf = mddev->private;
1922    unsigned long flags;
1923    pr_debug("raid456: error called\n");
1924
1925    spin_lock_irqsave(&conf->device_lock, flags);
1926    clear_bit(In_sync, &rdev->flags);
1927    mddev->degraded = calc_degraded(conf);
1928    spin_unlock_irqrestore(&conf->device_lock, flags);
1929    set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1930
1931    set_bit(Blocked, &rdev->flags);
1932    set_bit(Faulty, &rdev->flags);
1933    set_bit(MD_CHANGE_DEVS, &mddev->flags);
1934    printk(KERN_ALERT
1935           "md/raid:%s: Disk failure on %s, disabling device.\n"
1936           "md/raid:%s: Operation continuing on %d devices.\n",
1937           mdname(mddev),
1938           bdevname(rdev->bdev, b),
1939           mdname(mddev),
1940           conf->raid_disks - mddev->degraded);
1941}
1942
1943/*
1944 * Input: a 'big' sector number,
1945 * Output: index of the data and parity disk, and the sector # in them.
1946 */
1947static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
1948                     int previous, int *dd_idx,
1949                     struct stripe_head *sh)
1950{
1951    sector_t stripe, stripe2;
1952    sector_t chunk_number;
1953    unsigned int chunk_offset;
1954    int pd_idx, qd_idx;
1955    int ddf_layout = 0;
1956    sector_t new_sector;
1957    int algorithm = previous ? conf->prev_algo
1958                 : conf->algorithm;
1959    int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1960                     : conf->chunk_sectors;
1961    int raid_disks = previous ? conf->previous_raid_disks
1962                  : conf->raid_disks;
1963    int data_disks = raid_disks - conf->max_degraded;
1964
1965    /* First compute the information on this sector */
1966
1967    /*
1968     * Compute the chunk number and the sector offset inside the chunk
1969     */
1970    chunk_offset = sector_div(r_sector, sectors_per_chunk);
1971    chunk_number = r_sector;
1972
1973    /*
1974     * Compute the stripe number
1975     */
1976    stripe = chunk_number;
1977    *dd_idx = sector_div(stripe, data_disks);
1978    stripe2 = stripe;
1979    /*
1980     * Select the parity disk based on the user selected algorithm.
1981     */
1982    pd_idx = qd_idx = -1;
1983    switch(conf->level) {
1984    case 4:
1985        pd_idx = data_disks;
1986        break;
1987    case 5:
1988        switch (algorithm) {
1989        case ALGORITHM_LEFT_ASYMMETRIC:
1990            pd_idx = data_disks - sector_div(stripe2, raid_disks);
1991            if (*dd_idx >= pd_idx)
1992                (*dd_idx)++;
1993            break;
1994        case ALGORITHM_RIGHT_ASYMMETRIC:
1995            pd_idx = sector_div(stripe2, raid_disks);
1996            if (*dd_idx >= pd_idx)
1997                (*dd_idx)++;
1998            break;
1999        case ALGORITHM_LEFT_SYMMETRIC:
2000            pd_idx = data_disks - sector_div(stripe2, raid_disks);
2001            *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2002            break;
2003        case ALGORITHM_RIGHT_SYMMETRIC:
2004            pd_idx = sector_div(stripe2, raid_disks);
2005            *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2006            break;
2007        case ALGORITHM_PARITY_0:
2008            pd_idx = 0;
2009            (*dd_idx)++;
2010            break;
2011        case ALGORITHM_PARITY_N:
2012            pd_idx = data_disks;
2013            break;
2014        default:
2015            BUG();
2016        }
2017        break;
2018    case 6:
2019
2020        switch (algorithm) {
2021        case ALGORITHM_LEFT_ASYMMETRIC:
2022            pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2023            qd_idx = pd_idx + 1;
2024            if (pd_idx == raid_disks-1) {
2025                (*dd_idx)++; /* Q D D D P */
2026                qd_idx = 0;
2027            } else if (*dd_idx >= pd_idx)
2028                (*dd_idx) += 2; /* D D P Q D */
2029            break;
2030        case ALGORITHM_RIGHT_ASYMMETRIC:
2031            pd_idx = sector_div(stripe2, raid_disks);
2032            qd_idx = pd_idx + 1;
2033            if (pd_idx == raid_disks-1) {
2034                (*dd_idx)++; /* Q D D D P */
2035                qd_idx = 0;
2036            } else if (*dd_idx >= pd_idx)
2037                (*dd_idx) += 2; /* D D P Q D */
2038            break;
2039        case ALGORITHM_LEFT_SYMMETRIC:
2040            pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2041            qd_idx = (pd_idx + 1) % raid_disks;
2042            *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2043            break;
2044        case ALGORITHM_RIGHT_SYMMETRIC:
2045            pd_idx = sector_div(stripe2, raid_disks);
2046            qd_idx = (pd_idx + 1) % raid_disks;
2047            *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2048            break;
2049
2050        case ALGORITHM_PARITY_0:
2051            pd_idx = 0;
2052            qd_idx = 1;
2053            (*dd_idx) += 2;
2054            break;
2055        case ALGORITHM_PARITY_N:
2056            pd_idx = data_disks;
2057            qd_idx = data_disks + 1;
2058            break;
2059
2060        case ALGORITHM_ROTATING_ZERO_RESTART:
2061            /* Exactly the same as RIGHT_ASYMMETRIC, but order
2062             * of blocks for computing Q is different.
2063             */
2064            pd_idx = sector_div(stripe2, raid_disks);
2065            qd_idx = pd_idx + 1;
2066            if (pd_idx == raid_disks-1) {
2067                (*dd_idx)++; /* Q D D D P */
2068                qd_idx = 0;
2069            } else if (*dd_idx >= pd_idx)
2070                (*dd_idx) += 2; /* D D P Q D */
2071            ddf_layout = 1;
2072            break;
2073
2074        case ALGORITHM_ROTATING_N_RESTART:
2075            /* Same as left_asymmetric, but the first stripe is
2076             * D D D P Q rather than
2077             * Q D D D P
2078             */
2079            stripe2 += 1;
2080            pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2081            qd_idx = pd_idx + 1;
2082            if (pd_idx == raid_disks-1) {
2083                (*dd_idx)++; /* Q D D D P */
2084                qd_idx = 0;
2085            } else if (*dd_idx >= pd_idx)
2086                (*dd_idx) += 2; /* D D P Q D */
2087            ddf_layout = 1;
2088            break;
2089
2090        case ALGORITHM_ROTATING_N_CONTINUE:
2091            /* Same as left_symmetric but Q is before P */
2092            pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2093            qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2094            *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2095            ddf_layout = 1;
2096            break;
2097
2098        case ALGORITHM_LEFT_ASYMMETRIC_6:
2099            /* RAID5 left_asymmetric, with Q on last device */
2100            pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2101            if (*dd_idx >= pd_idx)
2102                (*dd_idx)++;
2103            qd_idx = raid_disks - 1;
2104            break;
2105
2106        case ALGORITHM_RIGHT_ASYMMETRIC_6:
2107            pd_idx = sector_div(stripe2, raid_disks-1);
2108            if (*dd_idx >= pd_idx)
2109                (*dd_idx)++;
2110            qd_idx = raid_disks - 1;
2111            break;
2112
2113        case ALGORITHM_LEFT_SYMMETRIC_6:
2114            pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2115            *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2116            qd_idx = raid_disks - 1;
2117            break;
2118
2119        case ALGORITHM_RIGHT_SYMMETRIC_6:
2120            pd_idx = sector_div(stripe2, raid_disks-1);
2121            *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2122            qd_idx = raid_disks - 1;
2123            break;
2124
2125        case ALGORITHM_PARITY_0_6:
2126            pd_idx = 0;
2127            (*dd_idx)++;
2128            qd_idx = raid_disks - 1;
2129            break;
2130
2131        default:
2132            BUG();
2133        }
2134        break;
2135    }
2136
2137    if (sh) {
2138        sh->pd_idx = pd_idx;
2139        sh->qd_idx = qd_idx;
2140        sh->ddf_layout = ddf_layout;
2141    }
2142    /*
2143     * Finally, compute the new sector number
2144     */
2145    new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2146    return new_sector;
2147}
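/*
 * Worked example, illustrative only: a 5-device RAID5 array using
 * ALGORITHM_LEFT_SYMMETRIC with 128-sector (64KiB) chunks has
 * data_disks = 4, and a request for array sector 1000 maps as
 *
 *     chunk_offset = 1000 % 128 = 104     chunk_number = 1000 / 128 = 7
 *     *dd_idx      = 7 % 4 = 3            stripe       = 7 / 4 = 1
 *     pd_idx       = 4 - (1 % 5) = 3
 *     *dd_idx      = (3 + 1 + 3) % 5 = 2
 *     new_sector   = 1 * 128 + 104 = 232
 *
 * so the data lives at sector 232 of device 2 and the parity block for
 * that stripe sits on device 3. The other layouts differ only in the
 * switch above that selects pd_idx (and qd_idx for RAID6).
 */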
2148
2149
2150static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
2151{
2152    struct r5conf *conf = sh->raid_conf;
2153    int raid_disks = sh->disks;
2154    int data_disks = raid_disks - conf->max_degraded;
2155    sector_t new_sector = sh->sector, check;
2156    int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2157                     : conf->chunk_sectors;
2158    int algorithm = previous ? conf->prev_algo
2159                 : conf->algorithm;
2160    sector_t stripe;
2161    int chunk_offset;
2162    sector_t chunk_number;
2163    int dummy1, dd_idx = i;
2164    sector_t r_sector;
2165    struct stripe_head sh2;
2166
2167
2168    chunk_offset = sector_div(new_sector, sectors_per_chunk);
2169    stripe = new_sector;
2170
2171    if (i == sh->pd_idx)
2172        return 0;
2173    switch(conf->level) {
2174    case 4: break;
2175    case 5:
2176        switch (algorithm) {
2177        case ALGORITHM_LEFT_ASYMMETRIC:
2178        case ALGORITHM_RIGHT_ASYMMETRIC:
2179            if (i > sh->pd_idx)
2180                i--;
2181            break;
2182        case ALGORITHM_LEFT_SYMMETRIC:
2183        case ALGORITHM_RIGHT_SYMMETRIC:
2184            if (i < sh->pd_idx)
2185                i += raid_disks;
2186            i -= (sh->pd_idx + 1);
2187            break;
2188        case ALGORITHM_PARITY_0:
2189            i -= 1;
2190            break;
2191        case ALGORITHM_PARITY_N:
2192            break;
2193        default:
2194            BUG();
2195        }
2196        break;
2197    case 6:
2198        if (i == sh->qd_idx)
2199            return 0; /* It is the Q disk */
2200        switch (algorithm) {
2201        case ALGORITHM_LEFT_ASYMMETRIC:
2202        case ALGORITHM_RIGHT_ASYMMETRIC:
2203        case ALGORITHM_ROTATING_ZERO_RESTART:
2204        case ALGORITHM_ROTATING_N_RESTART:
2205            if (sh->pd_idx == raid_disks-1)
2206                i--; /* Q D D D P */
2207            else if (i > sh->pd_idx)
2208                i -= 2; /* D D P Q D */
2209            break;
2210        case ALGORITHM_LEFT_SYMMETRIC:
2211        case ALGORITHM_RIGHT_SYMMETRIC:
2212            if (sh->pd_idx == raid_disks-1)
2213                i--; /* Q D D D P */
2214            else {
2215                /* D D P Q D */
2216                if (i < sh->pd_idx)
2217                    i += raid_disks;
2218                i -= (sh->pd_idx + 2);
2219            }
2220            break;
2221        case ALGORITHM_PARITY_0:
2222            i -= 2;
2223            break;
2224        case ALGORITHM_PARITY_N:
2225            break;
2226        case ALGORITHM_ROTATING_N_CONTINUE:
2227            /* Like left_symmetric, but P is before Q */
2228            if (sh->pd_idx == 0)
2229                i--; /* P D D D Q */
2230            else {
2231                /* D D Q P D */
2232                if (i < sh->pd_idx)
2233                    i += raid_disks;
2234                i -= (sh->pd_idx + 1);
2235            }
2236            break;
2237        case ALGORITHM_LEFT_ASYMMETRIC_6:
2238        case ALGORITHM_RIGHT_ASYMMETRIC_6:
2239            if (i > sh->pd_idx)
2240                i--;
2241            break;
2242        case ALGORITHM_LEFT_SYMMETRIC_6:
2243        case ALGORITHM_RIGHT_SYMMETRIC_6:
2244            if (i < sh->pd_idx)
2245                i += data_disks + 1;
2246            i -= (sh->pd_idx + 1);
2247            break;
2248        case ALGORITHM_PARITY_0_6:
2249            i -= 1;
2250            break;
2251        default:
2252            BUG();
2253        }
2254        break;
2255    }
2256
2257    chunk_number = stripe * data_disks + i;
2258    r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2259
2260    check = raid5_compute_sector(conf, r_sector,
2261                     previous, &dummy1, &sh2);
2262    if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2263        || sh2.qd_idx != sh->qd_idx) {
2264        printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2265               mdname(conf->mddev));
2266        return 0;
2267    }
2268    return r_sector;
2269}
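/*
 * Continuing the worked example above (5-device LEFT_SYMMETRIC array,
 * 128-sector chunks): with sh->sector = 232, pd_idx = 3 and i = 2,
 *
 *     chunk_offset = 232 % 128 = 104      stripe = 232 / 128 = 1
 *     i            = (2 + 5) - (3 + 1) = 3
 *     chunk_number = 1 * 4 + 3 = 7
 *     r_sector     = 7 * 128 + 104 = 1000
 *
 * which recovers the original array sector - exactly the round trip that
 * the raid5_compute_sector() cross-check at the end of this function
 * verifies.
 */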
2270
2271
2272static void
2273schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2274             int rcw, int expand)
2275{
2276    int i, pd_idx = sh->pd_idx, disks = sh->disks;
2277    struct r5conf *conf = sh->raid_conf;
2278    int level = conf->level;
2279
2280    if (rcw) {
2281        /* if we are not expanding this is a proper write request, and
2282         * there will be bios with new data to be drained into the
2283         * stripe cache
2284         */
2285        if (!expand) {
2286            sh->reconstruct_state = reconstruct_state_drain_run;
2287            set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2288        } else
2289            sh->reconstruct_state = reconstruct_state_run;
2290
2291        set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2292
2293        for (i = disks; i--; ) {
2294            struct r5dev *dev = &sh->dev[i];
2295
2296            if (dev->towrite) {
2297                set_bit(R5_LOCKED, &dev->flags);
2298                set_bit(R5_Wantdrain, &dev->flags);
2299                if (!expand)
2300                    clear_bit(R5_UPTODATE, &dev->flags);
2301                s->locked++;
2302            }
2303        }
2304        if (s->locked + conf->max_degraded == disks)
2305            if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2306                atomic_inc(&conf->pending_full_writes);
2307    } else {
2308        BUG_ON(level == 6);
2309        BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2310            test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2311
2312        sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2313        set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2314        set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2315        set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2316
2317        for (i = disks; i--; ) {
2318            struct r5dev *dev = &sh->dev[i];
2319            if (i == pd_idx)
2320                continue;
2321
2322            if (dev->towrite &&
2323                (test_bit(R5_UPTODATE, &dev->flags) ||
2324                 test_bit(R5_Wantcompute, &dev->flags))) {
2325                set_bit(R5_Wantdrain, &dev->flags);
2326                set_bit(R5_LOCKED, &dev->flags);
2327                clear_bit(R5_UPTODATE, &dev->flags);
2328                s->locked++;
2329            }
2330        }
2331    }
2332
2333    /* keep the parity disk(s) locked while asynchronous operations
2334     * are in flight
2335     */
2336    set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2337    clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2338    s->locked++;
2339
2340    if (level == 6) {
2341        int qd_idx = sh->qd_idx;
2342        struct r5dev *dev = &sh->dev[qd_idx];
2343
2344        set_bit(R5_LOCKED, &dev->flags);
2345        clear_bit(R5_UPTODATE, &dev->flags);
2346        s->locked++;
2347    }
2348
2349    pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2350        __func__, (unsigned long long)sh->sector,
2351        s->locked, s->ops_request);
2352}
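/*
 * The two branches above correspond to the two classic RAID5 parity
 * update strategies, sketched here as plain XOR equations:
 *
 *     reconstruct-write (rcw):   P_new = D0 ^ D1 ^ ... ^ Dn
 *         drain the new data into the stripe and recompute parity from
 *         every data block in the stripe.
 *
 *     read-modify-write (rmw):   P_new = P_old ^ D_old ^ D_new
 *         pre-XOR the old data and old parity out (STRIPE_OP_PREXOR),
 *         drain the new data in, then fold it back into the parity.
 *
 * The current implementation only uses rcw for RAID6, which is why the
 * rmw branch starts with BUG_ON(level == 6).
 */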
2353
2354/*
2355 * Each stripe/dev can have one or more bion attached.
2356 * toread/towrite point to the first in a chain.
2357 * The bi_next chain must be in order.
2358 */
2359static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2360{
2361    struct bio **bip;
2362    struct r5conf *conf = sh->raid_conf;
2363    int firstwrite=0;
2364
2365    pr_debug("adding bi b#%llu to stripe s#%llu\n",
2366        (unsigned long long)bi->bi_sector,
2367        (unsigned long long)sh->sector);
2368
2369    /*
2370     * If several bios share a stripe, the bio bi_phys_segments field acts
2371     * as a reference count to avoid races. The reference count should
2372     * already have been increased before this function is called (for
2373     * example, in make_request()), so other bios sharing this stripe will
2374     * not free it. If a stripe is used by only one bio, the stripe lock
2375     * protects it.
2376     */
2377    spin_lock_irq(&sh->stripe_lock);
2378    if (forwrite) {
2379        bip = &sh->dev[dd_idx].towrite;
2380        if (*bip == NULL)
2381            firstwrite = 1;
2382    } else
2383        bip = &sh->dev[dd_idx].toread;
2384    while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2385        if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2386            goto overlap;
2387        bip = & (*bip)->bi_next;
2388    }
2389    if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2390        goto overlap;
2391
2392    BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2393    if (*bip)
2394        bi->bi_next = *bip;
2395    *bip = bi;
2396    raid5_inc_bi_active_stripes(bi);
2397
2398    if (forwrite) {
2399        /* check if page is covered */
2400        sector_t sector = sh->dev[dd_idx].sector;
2401        for (bi=sh->dev[dd_idx].towrite;
2402             sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2403                 bi && bi->bi_sector <= sector;
2404             bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2405            if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2406                sector = bi->bi_sector + (bi->bi_size>>9);
2407        }
2408        if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2409            set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2410    }
2411    spin_unlock_irq(&sh->stripe_lock);
2412
2413    pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2414        (unsigned long long)(*bip)->bi_sector,
2415        (unsigned long long)sh->sector, dd_idx);
2416
2417    if (conf->mddev->bitmap && firstwrite) {
2418        bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2419                  STRIPE_SECTORS, 0);
2420        sh->bm_seq = conf->seq_flush+1;
2421        set_bit(STRIPE_BIT_DELAY, &sh->state);
2422    }
2423    return 1;
2424
2425 overlap:
2426    set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2427    spin_unlock_irq(&sh->stripe_lock);
2428    return 0;
2429}
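/*
 * A self-contained sketch (made-up types, compiled out) of the "is the
 * whole chunk covered" scan done above for writes: the towrite chain is
 * kept sorted by start sector, so the chunk is fully covered exactly
 * when every bio starts at or before the point already reached and the
 * scan ends up past the end of the chunk.
 */
#if 0
struct range {
	unsigned long long start;	/* in sectors */
	unsigned long long len;
};

static int chunk_covered(const struct range *r, int n,
			 unsigned long long dev_start,
			 unsigned long long dev_len)
{
	unsigned long long reached = dev_start;
	int i;

	for (i = 0; i < n && r[i].start <= reached; i++)
		if (r[i].start + r[i].len > reached)
			reached = r[i].start + r[i].len;

	return reached >= dev_start + dev_len;	/* R5_OVERWRITE analogue */
}
#endif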
2430
2431static void end_reshape(struct r5conf *conf);
2432
2433static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
2434                struct stripe_head *sh)
2435{
2436    int sectors_per_chunk =
2437        previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2438    int dd_idx;
2439    int chunk_offset = sector_div(stripe, sectors_per_chunk);
2440    int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2441
2442    raid5_compute_sector(conf,
2443                 stripe * (disks - conf->max_degraded)
2444                 *sectors_per_chunk + chunk_offset,
2445                 previous,
2446                 &dd_idx, sh);
2447}
2448
2449static void
2450handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2451                struct stripe_head_state *s, int disks,
2452                struct bio **return_bi)
2453{
2454    int i;
2455    for (i = disks; i--; ) {
2456        struct bio *bi;
2457        int bitmap_end = 0;
2458
2459        if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2460            struct md_rdev *rdev;
2461            rcu_read_lock();
2462            rdev = rcu_dereference(conf->disks[i].rdev);
2463            if (rdev && test_bit(In_sync, &rdev->flags))
2464                atomic_inc(&rdev->nr_pending);
2465            else
2466                rdev = NULL;
2467            rcu_read_unlock();
2468            if (rdev) {
2469                if (!rdev_set_badblocks(
2470                        rdev,
2471                        sh->sector,
2472                        STRIPE_SECTORS, 0))
2473                    md_error(conf->mddev, rdev);
2474                rdev_dec_pending(rdev, conf->mddev);
2475            }
2476        }
2477        spin_lock_irq(&sh->stripe_lock);
2478        /* fail all writes first */
2479        bi = sh->dev[i].towrite;
2480        sh->dev[i].towrite = NULL;
2481        spin_unlock_irq(&sh->stripe_lock);
2482        if (bi) {
2483            s->to_write--;
2484            bitmap_end = 1;
2485        }
2486
2487        if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2488            wake_up(&conf->wait_for_overlap);
2489
2490        while (bi && bi->bi_sector <
2491            sh->dev[i].sector + STRIPE_SECTORS) {
2492            struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2493            clear_bit(BIO_UPTODATE, &bi->bi_flags);
2494            if (!raid5_dec_bi_active_stripes(bi)) {
2495                md_write_end(conf->mddev);
2496                bi->bi_next = *return_bi;
2497                *return_bi = bi;
2498            }
2499            bi = nextbi;
2500        }
2501        if (bitmap_end)
2502            bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2503                STRIPE_SECTORS, 0, 0);
2504        bitmap_end = 0;
2505        /* and fail all 'written' */
2506        bi = sh->dev[i].written;
2507        sh->dev[i].written = NULL;
2508        if (bi) bitmap_end = 1;
2509        while (bi && bi->bi_sector <
2510               sh->dev[i].sector + STRIPE_SECTORS) {
2511            struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2512            clear_bit(BIO_UPTODATE, &bi->bi_flags);
2513            if (!raid5_dec_bi_active_stripes(bi)) {
2514                md_write_end(conf->mddev);
2515                bi->bi_next = *return_bi;
2516                *return_bi = bi;
2517            }
2518            bi = bi2;
2519        }
2520
2521        /* fail any reads if this device is non-operational and
2522         * the data has not reached the cache yet.
2523         */
2524        if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2525            (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2526              test_bit(R5_ReadError, &sh->dev[i].flags))) {
2527            bi = sh->dev[i].toread;
2528            sh->dev[i].toread = NULL;
2529            if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2530                wake_up(&conf->wait_for_overlap);
2531            if (bi) s->to_read--;
2532            while (bi && bi->bi_sector <
2533                   sh->dev[i].sector + STRIPE_SECTORS) {
2534                struct bio *nextbi =
2535                    r5_next_bio(bi, sh->dev[i].sector);
2536                clear_bit(BIO_UPTODATE, &bi->bi_flags);
2537                if (!raid5_dec_bi_active_stripes(bi)) {
2538                    bi->bi_next = *return_bi;
2539                    *return_bi = bi;
2540                }
2541                bi = nextbi;
2542            }
2543        }
2544        if (bitmap_end)
2545            bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2546                    STRIPE_SECTORS, 0, 0);
2547        /* If we were in the middle of a write the parity block might
2548         * still be locked - so just clear all R5_LOCKED flags
2549         */
2550        clear_bit(R5_LOCKED, &sh->dev[i].flags);
2551    }
2552
2553    if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2554        if (atomic_dec_and_test(&conf->pending_full_writes))
2555            md_wakeup_thread(conf->mddev->thread);
2556}
2557
2558static void
2559handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
2560           struct stripe_head_state *s)
2561{
2562    int abort = 0;
2563    int i;
2564
2565    clear_bit(STRIPE_SYNCING, &sh->state);
2566    s->syncing = 0;
2567    s->replacing = 0;
2568    /* There is nothing more to do for sync/check/repair.
2569     * Don't even need to abort as that is handled elsewhere
2570     * if needed, and not always wanted e.g. if there is a known
2571     * bad block here.
2572     * For recover/replace we need to record a bad block on all
2573     * non-sync devices, or abort the recovery
2574     */
2575    if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
2576        /* During recovery devices cannot be removed, so
2577         * locking and refcounting of rdevs is not needed
2578         */
2579        for (i = 0; i < conf->raid_disks; i++) {
2580            struct md_rdev *rdev = conf->disks[i].rdev;
2581            if (rdev
2582                && !test_bit(Faulty, &rdev->flags)
2583                && !test_bit(In_sync, &rdev->flags)
2584                && !rdev_set_badblocks(rdev, sh->sector,
2585                           STRIPE_SECTORS, 0))
2586                abort = 1;
2587            rdev = conf->disks[i].replacement;
2588            if (rdev
2589                && !test_bit(Faulty, &rdev->flags)
2590                && !test_bit(In_sync, &rdev->flags)
2591                && !rdev_set_badblocks(rdev, sh->sector,
2592                           STRIPE_SECTORS, 0))
2593                abort = 1;
2594        }
2595        if (abort)
2596            conf->recovery_disabled =
2597                conf->mddev->recovery_disabled;
2598    }
2599    md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
2600}
2601
2602static int want_replace(struct stripe_head *sh, int disk_idx)
2603{
2604    struct md_rdev *rdev;
2605    int rv = 0;
2606    /* Doing recovery so rcu locking not required */
2607    rdev = sh->raid_conf->disks[disk_idx].replacement;
2608    if (rdev
2609        && !test_bit(Faulty, &rdev->flags)
2610        && !test_bit(In_sync, &rdev->flags)
2611        && (rdev->recovery_offset <= sh->sector
2612        || rdev->mddev->recovery_cp <= sh->sector))
2613        rv = 1;
2614
2615    return rv;
2616}
2617
2618/* fetch_block - checks the given member device to see if its data needs
2619 * to be read or computed to satisfy a request.
2620 *
2621 * Returns 1 when no more member devices need to be checked, otherwise returns
2622 * 0 to tell the loop in handle_stripe_fill to continue
2623 */
2624static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2625               int disk_idx, int disks)
2626{
2627    struct r5dev *dev = &sh->dev[disk_idx];
2628    struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2629                  &sh->dev[s->failed_num[1]] };
2630
2631    /* is the data in this block needed, and can we get it? */
2632    if (!test_bit(R5_LOCKED, &dev->flags) &&
2633        !test_bit(R5_UPTODATE, &dev->flags) &&
2634        (dev->toread ||
2635         (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2636         s->syncing || s->expanding ||
2637         (s->replacing && want_replace(sh, disk_idx)) ||
2638         (s->failed >= 1 && fdev[0]->toread) ||
2639         (s->failed >= 2 && fdev[1]->toread) ||
2640         (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2641          !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2642         (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2643        /* we would like to get this block, possibly by computing it,
2644         * otherwise read it if the backing disk is insync
2645         */
2646        BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2647        BUG_ON(test_bit(R5_Wantread, &dev->flags));
2648        if ((s->uptodate == disks - 1) &&
2649            (s->failed && (disk_idx == s->failed_num[0] ||
2650                   disk_idx == s->failed_num[1]))) {
2651            /* have disk failed, and we're requested to fetch it;
2652            /* a disk has failed and we have been asked to fetch its block,
2653             * so compute it
2654            pr_debug("Computing stripe %llu block %d\n",
2655                   (unsigned long long)sh->sector, disk_idx);
2656            set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2657            set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2658            set_bit(R5_Wantcompute, &dev->flags);
2659            sh->ops.target = disk_idx;
2660            sh->ops.target2 = -1; /* no 2nd target */
2661            s->req_compute = 1;
2662            /* Careful: from this point on 'uptodate' is in the eye
2663             * of raid_run_ops which services 'compute' operations
2664             * before writes. R5_Wantcompute flags a block that will
2665             * be R5_UPTODATE by the time it is needed for a
2666             * subsequent operation.
2667             */
2668            s->uptodate++;
2669            return 1;
2670        } else if (s->uptodate == disks-2 && s->failed >= 2) {
2671            /* Computing 2-failure is *very* expensive; only
2672             * do it if failed >= 2
2673             */
2674            int other;
2675            for (other = disks; other--; ) {
2676                if (other == disk_idx)
2677                    continue;
2678                if (!test_bit(R5_UPTODATE,
2679                      &sh->dev[other].flags))
2680                    break;
2681            }
2682            BUG_ON(other < 0);
2683            pr_debug("Computing stripe %llu blocks %d,%d\n",
2684                   (unsigned long long)sh->sector,
2685                   disk_idx, other);
2686            set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2687            set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2688            set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2689            set_bit(R5_Wantcompute, &sh->dev[other].flags);
2690            sh->ops.target = disk_idx;
2691            sh->ops.target2 = other;
2692            s->uptodate += 2;
2693            s->req_compute = 1;
2694            return 1;
2695        } else if (test_bit(R5_Insync, &dev->flags)) {
2696            set_bit(R5_LOCKED, &dev->flags);
2697            set_bit(R5_Wantread, &dev->flags);
2698            s->locked++;
2699            pr_debug("Reading block %d (sync=%d)\n",
2700                disk_idx, s->syncing);
2701        }
2702    }
2703
2704    return 0;
2705}
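/*
 * In short, once the block is deemed wanted, the chain above picks one
 * of three actions, in this order of preference:
 *
 *   1/ uptodate == disks - 1 and this is one of the failed blocks:
 *      schedule a single-target compute (R5_Wantcompute).
 *   2/ uptodate == disks - 2 and failed >= 2 (RAID6 only):
 *      schedule a two-target compute covering this block and the other
 *      missing one.
 *   3/ otherwise, if the backing device is In_sync, issue a read
 *      (R5_Wantread).
 */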
2706
2707/**
2708 * handle_stripe_fill - read or compute data to satisfy pending requests.
2709 */
2710static void handle_stripe_fill(struct stripe_head *sh,
2711                   struct stripe_head_state *s,
2712                   int disks)
2713{
2714    int i;
2715
2716    /* look for blocks to read/compute, skip this if a compute
2717     * is already in flight, or if the stripe contents are in the
2718     * midst of changing due to a write
2719     */
2720    if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2721        !sh->reconstruct_state)
2722        for (i = disks; i--; )
2723            if (fetch_block(sh, s, i, disks))
2724                break;
2725    set_bit(STRIPE_HANDLE, &sh->state);
2726}
2727
2728
2729/* handle_stripe_clean_event
2730 * any written block on an uptodate or failed drive can be returned.
2731 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2732 * never LOCKED, so we don't need to test 'failed' directly.
2733 */
2734static void handle_stripe_clean_event(struct r5conf *conf,
2735    struct stripe_head *sh, int disks, struct bio **return_bi)
2736{
2737    int i;
2738    struct r5dev *dev;
2739
2740    for (i = disks; i--; )
2741        if (sh->dev[i].written) {
2742            dev = &sh->dev[i];
2743            if (!test_bit(R5_LOCKED, &dev->flags) &&
2744                test_bit(R5_UPTODATE, &dev->flags)) {
2745                /* We can return any write requests */
2746                struct bio *wbi, *wbi2;
2747                pr_debug("Return write for disc %d\n", i);
2748                wbi = dev->written;
2749                dev->written = NULL;
2750                while (wbi && wbi->bi_sector <
2751                    dev->sector + STRIPE_SECTORS) {
2752                    wbi2 = r5_next_bio(wbi, dev->sector);
2753                    if (!raid5_dec_bi_active_stripes(wbi)) {
2754                        md_write_end(conf->mddev);
2755                        wbi->bi_next = *return_bi;
2756                        *return_bi = wbi;
2757                    }
2758                    wbi = wbi2;
2759                }
2760                bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2761                        STRIPE_SECTORS,
2762                     !test_bit(STRIPE_DEGRADED, &sh->state),
2763                        0);
2764            }
2765        }
2766
2767    if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2768        if (atomic_dec_and_test(&conf->pending_full_writes))
2769            md_wakeup_thread(conf->mddev->thread);
2770}
2771
2772static void handle_stripe_dirtying(struct r5conf *conf,
2773                   struct stripe_head *sh,
2774                   struct stripe_head_state *s,
2775                   int disks)
2776{
2777    int rmw = 0, rcw = 0, i;
2778    if (conf->max_degraded == 2) {
2779        /* RAID6 requires 'rcw' in current implementation
2780         * Calculate the real rcw later - for now fake it to
2781         * look like rcw is cheaper
2782         */
2783        rcw = 1; rmw = 2;
2784    } else for (i = disks; i--; ) {
2785        /* would I have to read this buffer for read_modify_write */
2786        struct r5dev *dev = &sh->dev[i];
2787        if ((dev->towrite || i == sh->pd_idx) &&
2788            !test_bit(R5_LOCKED, &dev->flags) &&
2789            !(test_bit(R5_UPTODATE, &dev->flags) ||
2790              test_bit(R5_Wantcompute, &dev->flags))) {
2791            if (test_bit(R5_Insync, &dev->flags))
2792                rmw++;
2793            else
2794                rmw += 2*disks; /* cannot read it */
2795        }
2796        /* Would I have to read this buffer for reconstruct_write */
2797        if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2798            !test_bit(R5_LOCKED, &dev->flags) &&
2799            !(test_bit(R5_UPTODATE, &dev->flags) ||
2800            test_bit(R5_Wantcompute, &dev->flags))) {
2801            if (test_bit(R5_Insync, &dev->flags)) rcw++;
2802            else
2803                rcw += 2*disks;
2804        }
2805    }
2806    pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2807        (unsigned long long)sh->sector, rmw, rcw);
2808    set_bit(STRIPE_HANDLE, &sh->state);
2809    if (rmw < rcw && rmw > 0)
2810        /* prefer read-modify-write, but need to get some data */
2811        for (i = disks; i--; ) {
2812            struct r5dev *dev = &sh->dev[i];
2813            if ((dev->towrite || i == sh->pd_idx) &&
2814                !test_bit(R5_LOCKED, &dev->flags) &&
2815                !(test_bit(R5_UPTODATE, &dev->flags) ||
2816                test_bit(R5_Wantcompute, &dev->flags)) &&
2817                test_bit(R5_Insync, &dev->flags)) {
2818                if (
2819                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2820                    pr_debug("Read_old block "
2821                        "%d for r-m-w\n", i);
2822                    set_bit(R5_LOCKED, &dev->flags);
2823                    set_bit(R5_Wantread, &dev->flags);
2824                    s->locked++;
2825                } else {
2826                    set_bit(STRIPE_DELAYED, &sh->state);
2827                    set_bit(STRIPE_HANDLE, &sh->state);
2828                }
2829            }
2830        }
2831    if (rcw <= rmw && rcw > 0) {
2832        /* want reconstruct write, but need to get some data */
2833        rcw = 0;
2834        for (i = disks; i--; ) {
2835            struct r5dev *dev = &sh->dev[i];
2836            if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2837                i != sh->pd_idx && i != sh->qd_idx &&
2838                !test_bit(R5_LOCKED, &dev->flags) &&
2839                !(test_bit(R5_UPTODATE, &dev->flags) ||
2840                  test_bit(R5_Wantcompute, &dev->flags))) {
2841                rcw++;
2842                if (!test_bit(R5_Insync, &dev->flags))
2843                    continue; /* it's a failed drive */
2844                if (
2845                  test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2846                    pr_debug("Read_old block "
2847                        "%d for Reconstruct\n", i);
2848                    set_bit(R5_LOCKED, &dev->flags);
2849                    set_bit(R5_Wantread, &dev->flags);
2850                    s->locked++;
2851                } else {
2852                    set_bit(STRIPE_DELAYED, &sh->state);
2853                    set_bit(STRIPE_HANDLE, &sh->state);
2854                }
2855            }
2856        }
2857    }
2858    /* now if nothing is locked, and if we have enough data,
2859     * we can start a write request
2860     */
2861    /* since handle_stripe can be called at any time we need to handle the
2862     * case where a compute block operation has been submitted and then a
2863     * subsequent call wants to start a write request. raid_run_ops only
2864     * handles the case where compute block and reconstruct are requested
2865     * simultaneously. If this is not the case then new writes need to be
2866     * held off until the compute completes.
2867     */
2868    if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2869        (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2870        !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2871        schedule_reconstruction(sh, s, rcw == 0, 0);
2872}
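/*
 * Worked example, illustrative only: on a 5-device RAID5 stripe with
 * nothing cached yet, a write that fully overwrites one data chunk needs
 * the old copy of that chunk plus the old parity for read-modify-write
 * (rmw = 2) but the three untouched data chunks for reconstruct-write
 * (rcw = 3), so r-m-w wins. A write covering three of the four data
 * chunks flips the balance: rmw = 3 + 1 = 4 versus rcw = 1, so
 * reconstruct-write is chosen. Devices that are not In_sync are charged
 * 2*disks above, which guarantees that a plan needing a read from a
 * failed device is never preferred.
 */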
2873
2874static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2875                struct stripe_head_state *s, int disks)
2876{
2877    struct r5dev *dev = NULL;
2878
2879    set_bit(STRIPE_HANDLE, &sh->state);
2880
2881    switch (sh->check_state) {
2882    case check_state_idle:
2883        /* start a new check operation if there are no failures */
2884        if (s->failed == 0) {
2885            BUG_ON(s->uptodate != disks);
2886            sh->check_state = check_state_run;
2887            set_bit(STRIPE_OP_CHECK, &s->ops_request);
2888            clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2889            s->uptodate--;
2890            break;
2891        }
2892        dev = &sh->dev[s->failed_num[0]];
2893        /* fall through */
2894    case check_state_compute_result:
2895        sh->check_state = check_state_idle;
2896        if (!dev)
2897            dev = &sh->dev[sh->pd_idx];
2898
2899        /* check that a write has not made the stripe insync */
2900        if (test_bit(STRIPE_INSYNC, &sh->state))
2901            break;
2902
2903        /* either failed parity check, or recovery is happening */
2904        BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2905        BUG_ON(s->uptodate != disks);
2906
2907        set_bit(R5_LOCKED, &dev->flags);
2908        s->locked++;
2909        set_bit(R5_Wantwrite, &dev->flags);
2910
2911        clear_bit(STRIPE_DEGRADED, &sh->state);
2912        set_bit(STRIPE_INSYNC, &sh->state);
2913        break;
2914    case check_state_run:
2915        break; /* we will be called again upon completion */
2916    case check_state_check_result:
2917        sh->check_state = check_state_idle;
2918
2919        /* if a failure occurred during the check operation, leave
2920         * STRIPE_INSYNC not set and let the stripe be handled again
2921         */
2922        if (s->failed)
2923            break;
2924
2925        /* handle a successful check operation, if parity is correct
2926         * we are done. Otherwise update the mismatch count and repair
2927         * parity if !MD_RECOVERY_CHECK
2928         */
2929        if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2930            /* parity is correct (on disc,
2931             * not in buffer any more)
2932             */
2933            set_bit(STRIPE_INSYNC, &sh->state);
2934        else {
2935            conf->mddev->resync_mismatches += STRIPE_SECTORS;
2936            if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2937                /* don't try to repair!! */
2938                set_bit(STRIPE_INSYNC, &sh->state);
2939            else {
2940                sh->check_state = check_state_compute_run;
2941                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2942                set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2943                set_bit(R5_Wantcompute,
2944                    &sh->dev[sh->pd_idx].flags);
2945                sh->ops.target = sh->pd_idx;
2946                sh->ops.target2 = -1;
2947                s->uptodate++;
2948            }
2949        }
2950        break;
2951    case check_state_compute_run:
2952        break;
2953    default:
2954        printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2955               __func__, sh->check_state,
2956               (unsigned long long) sh->sector);
2957        BUG();
2958    }
2959}
2960
2961
2962static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
2963                  struct stripe_head_state *s,
2964                  int disks)
2965{
2966    int pd_idx = sh->pd_idx;
2967    int qd_idx = sh->qd_idx;
2968    struct r5dev *dev;
2969
2970    set_bit(STRIPE_HANDLE, &sh->state);
2971
2972    BUG_ON(s->failed > 2);
2973
2974    /* Want to check and possibly repair P and Q.
2975     * However there could be one 'failed' device, in which
2976     * case we can only check one of them, possibly using the
2977     * other to generate missing data
2978     */
2979
2980    switch (sh->check_state) {
2981    case check_state_idle:
2982        /* start a new check operation if there are < 2 failures */
2983        if (s->failed == s->q_failed) {
2984            /* The only possible failed device holds Q, so it
2985             * makes sense to check P (If anything else were failed,
2986             * we would have used P to recreate it).
2987             */
2988            sh->check_state = check_state_run;
2989        }
2990        if (!s->q_failed && s->failed < 2) {
2991            /* Q is not failed, and we didn't use it to generate
2992             * anything, so it makes sense to check it
2993             */
2994            if (sh->check_state == check_state_run)
2995                sh->check_state = check_state_run_pq;
2996            else
2997                sh->check_state = check_state_run_q;
2998        }
2999
3000        /* discard potentially stale zero_sum_result */
3001        sh->ops.zero_sum_result = 0;
3002
3003        if (sh->check_state == check_state_run) {
3004            /* async_xor_zero_sum destroys the contents of P */
3005            clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3006            s->uptodate--;
3007        }
3008        if (sh->check_state >= check_state_run &&
3009            sh->check_state <= check_state_run_pq) {
3010            /* async_syndrome_zero_sum preserves P and Q, so
3011             * no need to mark them !uptodate here
3012             */
3013            set_bit(STRIPE_OP_CHECK, &s->ops_request);
3014            break;
3015        }
3016
3017        /* we have 2-disk failure */
3018        BUG_ON(s->failed != 2);
3019        /* fall through */
3020    case check_state_compute_result:
3021        sh->check_state = check_state_idle;
3022
3023        /* check that a write has not made the stripe insync */
3024        if (test_bit(STRIPE_INSYNC, &sh->state))
3025            break;
3026
3027        /* now write out any block on a failed drive,
3028         * or P or Q if they were recomputed
3029         */
3030        BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
3031        if (s->failed == 2) {
3032            dev = &sh->dev[s->failed_num[1]];
3033            s->locked++;
3034            set_bit(R5_LOCKED, &dev->flags);
3035            set_bit(R5_Wantwrite, &dev->flags);
3036        }
3037        if (s->failed >= 1) {
3038            dev = &sh->dev[s->failed_num[0]];
3039            s->locked++;
3040            set_bit(R5_LOCKED, &dev->flags);
3041            set_bit(R5_Wantwrite, &dev->flags);
3042        }
3043        if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3044            dev = &sh->dev[pd_idx];
3045            s->locked++;
3046            set_bit(R5_LOCKED, &dev->flags);
3047            set_bit(R5_Wantwrite, &dev->flags);
3048        }
3049        if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3050            dev = &sh->dev[qd_idx];
3051            s->locked++;
3052            set_bit(R5_LOCKED, &dev->flags);
3053            set_bit(R5_Wantwrite, &dev->flags);
3054        }
3055        clear_bit(STRIPE_DEGRADED, &sh->state);
3056
3057        set_bit(STRIPE_INSYNC, &sh->state);
3058        break;
3059    case check_state_run:
3060    case check_state_run_q:
3061    case check_state_run_pq:
3062        break; /* we will be called again upon completion */
3063    case check_state_check_result:
3064        sh->check_state = check_state_idle;
3065
3066        /* handle a successful check operation, if parity is correct
3067         * we are done. Otherwise update the mismatch count and repair
3068         * parity if !MD_RECOVERY_CHECK
3069         */
3070        if (sh->ops.zero_sum_result == 0) {
3071            /* both parities are correct */
3072            if (!s->failed)
3073                set_bit(STRIPE_INSYNC, &sh->state);
3074            else {
3075                /* in contrast to the raid5 case we can validate
3076                 * parity, but still have a failure to write
3077                 * back
3078                 */
3079                sh->check_state = check_state_compute_result;
3080                /* Returning at this point means that we may go
3081                 * off and bring p and/or q uptodate again so
3082                 * we make sure to check zero_sum_result again
3083                 * to verify if p or q need writeback
3084                 */
3085            }
3086        } else {
3087            conf->mddev->resync_mismatches += STRIPE_SECTORS;
3088            if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3089                /* don't try to repair!! */
3090                set_bit(STRIPE_INSYNC, &sh->state);
3091            else {
3092                int *target = &sh->ops.target;
3093
3094                sh->ops.target = -1;
3095                sh->ops.target2 = -1;
3096                sh->check_state = check_state_compute_run;
3097                set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3098                set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3099                if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3100                    set_bit(R5_Wantcompute,
3101                        &sh->dev[pd_idx].flags);
3102                    *target = pd_idx;
3103                    target = &sh->ops.target2;
3104                    s->uptodate++;
3105                }
3106                if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3107                    set_bit(R5_Wantcompute,
3108                        &sh->dev[qd_idx].flags);
3109                    *target = qd_idx;
3110                    s->uptodate++;
3111                }
3112            }
3113        }
3114        break;
3115    case check_state_compute_run:
3116        break;
3117    default:
3118        printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3119               __func__, sh->check_state,
3120               (unsigned long long) sh->sector);
3121        BUG();
3122    }
3123}
3124
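/*
 * handle_stripe_expansion - copy the data blocks of a fully-read expand-source
 * stripe into the destination stripes of the new (reshaped) layout using
 * async_memcpy, and flag each destination stripe EXPAND_READY once all of its
 * data blocks have been filled in.
 */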
3125static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3126{
3127    int i;
3128
3129    /* We have read all the blocks in this stripe and now we need to
3130     * copy some of them into a target stripe for expand.
3131     */
3132    struct dma_async_tx_descriptor *tx = NULL;
3133    clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3134    for (i = 0; i < sh->disks; i++)
3135        if (i != sh->pd_idx && i != sh->qd_idx) {
3136            int dd_idx, j;
3137            struct stripe_head *sh2;
3138            struct async_submit_ctl submit;
3139
3140            sector_t bn = compute_blocknr(sh, i, 1);
3141            sector_t s = raid5_compute_sector(conf, bn, 0,
3142                              &dd_idx, NULL);
3143            sh2 = get_active_stripe(conf, s, 0, 1, 1);
3144            if (sh2 == NULL)
3145                /* so far only the early blocks of this stripe
3146                 * have been requested. When later blocks
3147                 * get requested, we will try again
3148                 */
3149                continue;
3150            if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
3151               test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
3152                /* must have already done this block */
3153                release_stripe(sh2);
3154                continue;
3155            }
3156
3157            /* place all the copies on one channel */
3158            init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
3159            tx = async_memcpy(sh2->dev[dd_idx].page,
3160                      sh->dev[i].page, 0, 0, STRIPE_SIZE,
3161                      &submit);
3162
3163            set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
3164            set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
3165            for (j = 0; j < conf->raid_disks; j++)
3166                if (j != sh2->pd_idx &&
3167                    j != sh2->qd_idx &&
3168                    !test_bit(R5_Expanded, &sh2->dev[j].flags))
3169                    break;
3170            if (j == conf->raid_disks) {
3171                set_bit(STRIPE_EXPAND_READY, &sh2->state);
3172                set_bit(STRIPE_HANDLE, &sh2->state);
3173            }
3174            release_stripe(sh2);
3175
3176        }
3177    /* done submitting copies, wait for them to complete */
3178    if (tx) {
3179        async_tx_ack(tx);
3180        dma_wait_for_async_tx(tx);
3181    }
3182}
3183
3184/*
3185 * handle_stripe - do things to a stripe.
3186 *
3187 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
3188 * state of various bits to see what needs to be done.
3189 * Possible results:
3190 * return some read requests which now have data
3191 * return some write requests which are safely on storage
3192 * schedule a read on some buffers
3193 * schedule a write of some buffers
3194 * return confirmation of parity correctness
3195 *
3196 */
3197
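/*
 * analyse_stripe - fill in a stripe_head_state by inspecting every device in
 * the stripe: count locked/uptodate/to_read/to_write/written buffers, note
 * failed or out-of-sync devices (preferring a recovered replacement for
 * reads), record any blocked rdev, and work out whether the stripe is being
 * synced, recovered or replaced.
 */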
3198static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3199{
3200    struct r5conf *conf = sh->raid_conf;
3201    int disks = sh->disks;
3202    struct r5dev *dev;
3203    int i;
3204    int do_recovery = 0;
3205
3206    memset(s, 0, sizeof(*s));
3207
3208    s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3209    s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3210    s->failed_num[0] = -1;
3211    s->failed_num[1] = -1;
3212
3213    /* Now to look around and see what can be done */
3214    rcu_read_lock();
3215    for (i=disks; i--; ) {
3216        struct md_rdev *rdev;
3217        sector_t first_bad;
3218        int bad_sectors;
3219        int is_bad = 0;
3220
3221        dev = &sh->dev[i];
3222
3223        pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3224             i, dev->flags,
3225             dev->toread, dev->towrite, dev->written);
3226        /* maybe we can reply to a read
3227         *
3228         * new wantfill requests are only permitted while
3229         * ops_complete_biofill is guaranteed to be inactive
3230         */
3231        if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3232            !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3233            set_bit(R5_Wantfill, &dev->flags);
3234
3235        /* now count some things */
3236        if (test_bit(R5_LOCKED, &dev->flags))
3237            s->locked++;
3238        if (test_bit(R5_UPTODATE, &dev->flags))
3239            s->uptodate++;
3240        if (test_bit(R5_Wantcompute, &dev->flags)) {
3241            s->compute++;
3242            BUG_ON(s->compute > 2);
3243        }
3244
3245        if (test_bit(R5_Wantfill, &dev->flags))
3246            s->to_fill++;
3247        else if (dev->toread)
3248            s->to_read++;
3249        if (dev->towrite) {
3250            s->to_write++;
3251            if (!test_bit(R5_OVERWRITE, &dev->flags))
3252                s->non_overwrite++;
3253        }
3254        if (dev->written)
3255            s->written++;
3256        /* Prefer to use the replacement for reads, but only
3257         * if it is recovered enough and has no bad blocks.
3258         */
3259        rdev = rcu_dereference(conf->disks[i].replacement);
3260        if (rdev && !test_bit(Faulty, &rdev->flags) &&
3261            rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
3262            !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3263                 &first_bad, &bad_sectors))
3264            set_bit(R5_ReadRepl, &dev->flags);
3265        else {
3266            if (rdev)
3267                set_bit(R5_NeedReplace, &dev->flags);
3268            rdev = rcu_dereference(conf->disks[i].rdev);
3269            clear_bit(R5_ReadRepl, &dev->flags);
3270        }
3271        if (rdev && test_bit(Faulty, &rdev->flags))
3272            rdev = NULL;
3273        if (rdev) {
3274            is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3275                         &first_bad, &bad_sectors);
3276            if (s->blocked_rdev == NULL
3277                && (test_bit(Blocked, &rdev->flags)
3278                || is_bad < 0)) {
3279                if (is_bad < 0)
3280                    set_bit(BlockedBadBlocks,
3281                        &rdev->flags);
3282                s->blocked_rdev = rdev;
3283                atomic_inc(&rdev->nr_pending);
3284            }
3285        }
3286        clear_bit(R5_Insync, &dev->flags);
3287        if (!rdev)
3288            /* Not in-sync */;
3289        else if (is_bad) {
3290            /* also not in-sync */
3291            if (!test_bit(WriteErrorSeen, &rdev->flags) &&
3292                test_bit(R5_UPTODATE, &dev->flags)) {
3293                /* treat as in-sync, but with a read error
3294                 * which we can now try to correct
3295                 */
3296                set_bit(R5_Insync, &dev->flags);
3297                set_bit(R5_ReadError, &dev->flags);
3298            }
3299        } else if (test_bit(In_sync, &rdev->flags))
3300            set_bit(R5_Insync, &dev->flags);
3301        else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3302            /* in sync if before recovery_offset */
3303            set_bit(R5_Insync, &dev->flags);
3304        else if (test_bit(R5_UPTODATE, &dev->flags) &&
3305             test_bit(R5_Expanded, &dev->flags))
3306            /* If we've reshaped into here, we assume it is Insync.
3307             * We will shortly update recovery_offset to make
3308             * it official.
3309             */
3310            set_bit(R5_Insync, &dev->flags);
3311
3312        if (rdev && test_bit(R5_WriteError, &dev->flags)) {
3313            /* This flag does not apply to '.replacement',
3314             * only to '.rdev', so make sure to check that. */
3315            struct md_rdev *rdev2 = rcu_dereference(
3316                conf->disks[i].rdev);
3317            if (rdev2 == rdev)
3318                clear_bit(R5_Insync, &dev->flags);
3319            if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3320                s->handle_bad_blocks = 1;
3321                atomic_inc(&rdev2->nr_pending);
3322            } else
3323                clear_bit(R5_WriteError, &dev->flags);
3324        }
3325        if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
3326            /* This flag does not apply to '.replacement',
3327             * only to '.rdev', so make sure to check that. */
3328            struct md_rdev *rdev2 = rcu_dereference(
3329                conf->disks[i].rdev);
3330            if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3331                s->handle_bad_blocks = 1;
3332                atomic_inc(&rdev2->nr_pending);
3333            } else
3334                clear_bit(R5_MadeGood, &dev->flags);
3335        }
3336        if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
3337            struct md_rdev *rdev2 = rcu_dereference(
3338                conf->disks[i].replacement);
3339            if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
3340                s->handle_bad_blocks = 1;
3341                atomic_inc(&rdev2->nr_pending);
3342            } else
3343                clear_bit(R5_MadeGoodRepl, &dev->flags);
3344        }
3345        if (!test_bit(R5_Insync, &dev->flags)) {
3346            /* The ReadError flag will just be confusing now */
3347            clear_bit(R5_ReadError, &dev->flags);
3348            clear_bit(R5_ReWrite, &dev->flags);
3349        }
3350        if (test_bit(R5_ReadError, &dev->flags))
3351            clear_bit(R5_Insync, &dev->flags);
3352        if (!test_bit(R5_Insync, &dev->flags)) {
3353            if (s->failed < 2)
3354                s->failed_num[s->failed] = i;
3355            s->failed++;
3356            if (rdev && !test_bit(Faulty, &rdev->flags))
3357                do_recovery = 1;
3358        }
3359    }
3360    if (test_bit(STRIPE_SYNCING, &sh->state)) {
3361        /* If there is a failed device being replaced,
3362         * we must be recovering.
3363         * else if we are after recovery_cp, we must be syncing
3364         * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
3365         * else we can only be replacing
3366         * sync and recovery both need to read all devices, and so
3367         * use the same flag.
3368         */
3369        if (do_recovery ||
3370            sh->sector >= conf->mddev->recovery_cp ||
3371            test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
3372            s->syncing = 1;
3373        else
3374            s->replacing = 1;
3375    }
3376    rcu_read_unlock();
3377}
3378
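/*
 * handle_stripe - run the per-stripe state machine: take STRIPE_ACTIVE,
 * analyse the stripe, then schedule any reads, writes, parity checks,
 * reconstruction and expansion work that is needed before issuing the
 * resulting ops and I/O.
 */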
3379static void handle_stripe(struct stripe_head *sh)
3380{
3381    struct stripe_head_state s;
3382    struct r5conf *conf = sh->raid_conf;
3383    int i;
3384    int prexor;
3385    int disks = sh->disks;
3386    struct r5dev *pdev, *qdev;
3387
3388    clear_bit(STRIPE_HANDLE, &sh->state);
3389    if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
3390        /* already being handled, ensure it gets handled
3391         * again when current action finishes */
3392        set_bit(STRIPE_HANDLE, &sh->state);
3393        return;
3394    }
3395
3396    if (test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
3397        set_bit(STRIPE_SYNCING, &sh->state);
3398        clear_bit(STRIPE_INSYNC, &sh->state);
3399    }
3400    clear_bit(STRIPE_DELAYED, &sh->state);
3401
3402    pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3403        "pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
3404           (unsigned long long)sh->sector, sh->state,
3405           atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
3406           sh->check_state, sh->reconstruct_state);
3407
3408    analyse_stripe(sh, &s);
3409
3410    if (s.handle_bad_blocks) {
3411        set_bit(STRIPE_HANDLE, &sh->state);
3412        goto finish;
3413    }
3414
3415    if (unlikely(s.blocked_rdev)) {
3416        if (s.syncing || s.expanding || s.expanded ||
3417            s.replacing || s.to_write || s.written) {
3418            set_bit(STRIPE_HANDLE, &sh->state);
3419            goto finish;
3420        }
3421        /* There is nothing for the blocked_rdev to block */
3422        rdev_dec_pending(s.blocked_rdev, conf->mddev);
3423        s.blocked_rdev = NULL;
3424    }
3425
3426    if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3427        set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3428        set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3429    }
3430
3431    pr_debug("locked=%d uptodate=%d to_read=%d"
3432           " to_write=%d failed=%d failed_num=%d,%d\n",
3433           s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3434           s.failed_num[0], s.failed_num[1]);
3435    /* check if the array has lost more than max_degraded devices and,
3436     * if so, some requests might need to be failed.
3437     */
3438    if (s.failed > conf->max_degraded) {
3439        sh->check_state = 0;
3440        sh->reconstruct_state = 0;
3441        if (s.to_read+s.to_write+s.written)
3442            handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
3443        if (s.syncing + s.replacing)
3444            handle_failed_sync(conf, sh, &s);
3445    }
3446
3447    /*
3448     * might be able to return some write requests if the parity blocks
3449     * are safe, or on a failed drive
3450     */
3451    pdev = &sh->dev[sh->pd_idx];
3452    s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
3453        || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
3454    qdev = &sh->dev[sh->qd_idx];
3455    s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
3456        || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
3457        || conf->level < 6;
3458
3459    if (s.written &&
3460        (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3461                 && !test_bit(R5_LOCKED, &pdev->flags)
3462                 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3463        (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3464                 && !test_bit(R5_LOCKED, &qdev->flags)
3465                 && test_bit(R5_UPTODATE, &qdev->flags)))))
3466        handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3467
3468    /* Now we might consider reading some blocks, either to check/generate
3469     * parity, or to satisfy requests
3470     * or to load a block that is being partially written.
3471     */
3472    if (s.to_read || s.non_overwrite
3473        || (conf->level == 6 && s.to_write && s.failed)
3474        || (s.syncing && (s.uptodate + s.compute < disks))
3475        || s.replacing
3476        || s.expanding)
3477        handle_stripe_fill(sh, &s, disks);
3478
3479    /* Now we check to see if any write operations have recently
3480     * completed
3481     */
3482    prexor = 0;
3483    if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3484        prexor = 1;
3485    if (sh->reconstruct_state == reconstruct_state_drain_result ||
3486        sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3487        sh->reconstruct_state = reconstruct_state_idle;
3488
3489        /* All the 'written' buffers and the parity block are ready to
3490         * be written back to disk
3491         */
3492        BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3493        BUG_ON(sh->qd_idx >= 0 &&
3494               !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags));
3495        for (i = disks; i--; ) {
3496            struct r5dev *dev = &sh->dev[i];
3497            if (test_bit(R5_LOCKED, &dev->flags) &&
3498                (i == sh->pd_idx || i == sh->qd_idx ||
3499                 dev->written)) {
3500                pr_debug("Writing block %d\n", i);
3501                set_bit(R5_Wantwrite, &dev->flags);
3502                if (prexor)
3503                    continue;
3504                if (!test_bit(R5_Insync, &dev->flags) ||
3505                    ((i == sh->pd_idx || i == sh->qd_idx) &&
3506                     s.failed == 0))
3507                    set_bit(STRIPE_INSYNC, &sh->state);
3508            }
3509        }
3510        if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3511            s.dec_preread_active = 1;
3512    }
3513
3514    /* Now to consider new write requests and what else, if anything
3515     * should be read. We do not handle new writes when:
3516     * 1/ A 'write' operation (copy+xor) is already in flight.
3517     * 2/ A 'check' operation is in flight, as it may clobber the parity
3518     * block.
3519     */
3520    if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3521        handle_stripe_dirtying(conf, sh, &s, disks);
3522
3523    /* maybe we need to check and possibly fix the parity for this stripe
3524     * Any reads will already have been scheduled, so we just see if enough
3525     * data is available. The parity check is held off while parity
3526     * dependent operations are in flight.
3527     */
3528    if (sh->check_state ||
3529        (s.syncing && s.locked == 0 &&
3530         !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3531         !test_bit(STRIPE_INSYNC, &sh->state))) {
3532        if (conf->level == 6)
3533            handle_parity_checks6(conf, sh, &s, disks);
3534        else
3535            handle_parity_checks5(conf, sh, &s, disks);
3536    }
3537
3538    if (s.replacing && s.locked == 0
3539        && !test_bit(STRIPE_INSYNC, &sh->state)) {
3540        /* Write out to replacement devices where possible */
3541        for (i = 0; i < conf->raid_disks; i++)
3542            if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
3543                test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
3544                set_bit(R5_WantReplace, &sh->dev[i].flags);
3545                set_bit(R5_LOCKED, &sh->dev[i].flags);
3546                s.locked++;
3547            }
3548        set_bit(STRIPE_INSYNC, &sh->state);
3549    }
3550    if ((s.syncing || s.replacing) && s.locked == 0 &&
3551        test_bit(STRIPE_INSYNC, &sh->state)) {
3552        md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3553        clear_bit(STRIPE_SYNCING, &sh->state);
3554    }
3555
3556    /* If the failed drives are just a ReadError, then we might need
3557     * to progress the repair/check process
3558     */
3559    if (s.failed <= conf->max_degraded && !conf->mddev->ro)
3560        for (i = 0; i < s.failed; i++) {
3561            struct r5dev *dev = &sh->dev[s.failed_num[i]];
3562            if (test_bit(R5_ReadError, &dev->flags)
3563                && !test_bit(R5_LOCKED, &dev->flags)
3564                && test_bit(R5_UPTODATE, &dev->flags)
3565                ) {
3566                if (!test_bit(R5_ReWrite, &dev->flags)) {
3567                    set_bit(R5_Wantwrite, &dev->flags);
3568                    set_bit(R5_ReWrite, &dev->flags);
3569                    set_bit(R5_LOCKED, &dev->flags);
3570                    s.locked++;
3571                } else {
3572                    /* let's read it back */
3573                    set_bit(R5_Wantread, &dev->flags);
3574                    set_bit(R5_LOCKED, &dev->flags);
3575                    s.locked++;
3576                }
3577            }
3578        }
3579
3580
3581    /* Finish reconstruct operations initiated by the expansion process */
3582    if (sh->reconstruct_state == reconstruct_state_result) {
3583        struct stripe_head *sh_src
3584            = get_active_stripe(conf, sh->sector, 1, 1, 1);
3585        if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
3586            /* sh cannot be written until sh_src has been read.
3587             * so arrange for sh to be delayed a little
3588             */
3589            set_bit(STRIPE_DELAYED, &sh->state);
3590            set_bit(STRIPE_HANDLE, &sh->state);
3591            if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3592                          &sh_src->state))
3593                atomic_inc(&conf->preread_active_stripes);
3594            release_stripe(sh_src);
3595            goto finish;
3596        }
3597        if (sh_src)
3598            release_stripe(sh_src);
3599
3600        sh->reconstruct_state = reconstruct_state_idle;
3601        clear_bit(STRIPE_EXPANDING, &sh->state);
3602        for (i = conf->raid_disks; i--; ) {
3603            set_bit(R5_Wantwrite, &sh->dev[i].flags);
3604            set_bit(R5_LOCKED, &sh->dev[i].flags);
3605            s.locked++;
3606        }
3607    }
3608
3609    if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3610        !sh->reconstruct_state) {
3611        /* Need to write out all blocks after computing parity */
3612        sh->disks = conf->raid_disks;
3613        stripe_set_idx(sh->sector, conf, 0, sh);
3614        schedule_reconstruction(sh, &s, 1, 1);
3615    } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3616        clear_bit(STRIPE_EXPAND_READY, &sh->state);
3617        atomic_dec(&conf->reshape_stripes);
3618        wake_up(&conf->wait_for_overlap);
3619        md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3620    }
3621
3622    if (s.expanding && s.locked == 0 &&
3623        !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3624        handle_stripe_expansion(conf, sh);
3625
3626finish:
3627    /* wait for this device to become unblocked */
3628    if (unlikely(s.blocked_rdev)) {
3629        if (conf->mddev->external)
3630            md_wait_for_blocked_rdev(s.blocked_rdev,
3631                         conf->mddev);
3632        else
3633            /* Internal metadata will immediately
3634             * be written by raid5d, so we don't
3635             * need to wait here.
3636             */
3637            rdev_dec_pending(s.blocked_rdev,
3638                     conf->mddev);
3639    }
3640
3641    if (s.handle_bad_blocks)
3642        for (i = disks; i--; ) {
3643            struct md_rdev *rdev;
3644            struct r5dev *dev = &sh->dev[i];
3645            if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
3646                /* We own a safe reference to the rdev */
3647                rdev = conf->disks[i].rdev;
3648                if (!rdev_set_badblocks(rdev, sh->sector,
3649                            STRIPE_SECTORS, 0))
3650                    md_error(conf->mddev, rdev);
3651                rdev_dec_pending(rdev, conf->mddev);
3652            }
3653            if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
3654                rdev = conf->disks[i].rdev;
3655                rdev_clear_badblocks(rdev, sh->sector,
3656                             STRIPE_SECTORS, 0);
3657                rdev_dec_pending(rdev, conf->mddev);
3658            }
3659            if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
3660                rdev = conf->disks[i].replacement;
3661                if (!rdev)
3662                    /* rdev has been moved down */
3663                    rdev = conf->disks[i].rdev;
3664                rdev_clear_badblocks(rdev, sh->sector,
3665                             STRIPE_SECTORS, 0);
3666                rdev_dec_pending(rdev, conf->mddev);
3667            }
3668        }
3669
3670    if (s.ops_request)
3671        raid_run_ops(sh, s.ops_request);
3672
3673    ops_run_io(sh, &s);
3674
3675    if (s.dec_preread_active) {
3676        /* We delay this until after ops_run_io so that if make_request
3677         * is waiting on a flush, it won't continue until the writes
3678         * have actually been submitted.
3679         */
3680        atomic_dec(&conf->preread_active_stripes);
3681        if (atomic_read(&conf->preread_active_stripes) <
3682            IO_THRESHOLD)
3683            md_wakeup_thread(conf->mddev->thread);
3684    }
3685
3686    return_io(s.return_bi);
3687
3688    clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
3689}
3690
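/*
 * raid5_activate_delayed - once preread activity has dropped below
 * IO_THRESHOLD, move stripes from the delayed list onto the hold list,
 * marking each one STRIPE_PREREAD_ACTIVE.  Runs under conf->device_lock,
 * like activate_bit_delay() below.
 */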
3691static void raid5_activate_delayed(struct r5conf *conf)
3692{
3693    if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3694        while (!list_empty(&conf->delayed_list)) {
3695            struct list_head *l = conf->delayed_list.next;
3696            struct stripe_head *sh;
3697            sh = list_entry(l, struct stripe_head, lru);
3698            list_del_init(l);
3699            clear_bit(STRIPE_DELAYED, &sh->state);
3700            if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3701                atomic_inc(&conf->preread_active_stripes);
3702            list_add_tail(&sh->lru, &conf->hold_list);
3703        }
3704    }
3705}
3706
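/*
 * activate_bit_delay - splice the stripes parked on bitmap_list and release
 * each one again via __release_stripe() so that their delayed writes can
 * proceed.
 */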
3707static void activate_bit_delay(struct r5conf *conf)
3708{
3709    /* device_lock is held */
3710    struct list_head head;
3711    list_add(&head, &conf->bitmap_list);
3712    list_del_init(&conf->bitmap_list);
3713    while (!list_empty(&head)) {
3714        struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3715        list_del_init(&sh->lru);
3716        atomic_inc(&sh->count);
3717        __release_stripe(conf, sh);
3718    }
3719}
3720
3721int md_raid5_congested(struct mddev *mddev, int bits)
3722{
3723    struct r5conf *conf = mddev->private;
3724
3725    /* No difference between reads and writes. Just check
3726     * how busy the stripe_cache is
3727     */
3728
3729    if (conf->inactive_blocked)
3730        return 1;
3731    if (conf->quiesce)
3732        return 1;
3733    if (list_empty_careful(&conf->inactive_list))
3734        return 1;
3735
3736    return 0;
3737}
3738EXPORT_SYMBOL_GPL(md_raid5_congested);
3739
3740static int raid5_congested(void *data, int bits)
3741{
3742    struct mddev *mddev = data;
3743
3744    return mddev_congested(mddev, bits) ||
3745        md_raid5_congested(mddev, bits);
3746}
3747
3748/* We want read requests to align with chunks where possible,
3749 * but write requests don't need to.
3750 */
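/*
 * For example, with 64KiB chunks (chunk_sectors = 128), a read bio that
 * already holds 8 sectors and starts 100 sectors into a chunk can accept at
 * most (128 - (100 + 8)) << 9 = 10240 more bytes before crossing the chunk
 * boundary; that is the 'max' value computed below.
 */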
3751static int raid5_mergeable_bvec(struct request_queue *q,
3752                struct bvec_merge_data *bvm,
3753                struct bio_vec *biovec)
3754{
3755    struct mddev *mddev = q->queuedata;
3756    sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3757    int max;
3758    unsigned int chunk_sectors = mddev->chunk_sectors;
3759    unsigned int bio_sectors = bvm->bi_size >> 9;
3760
3761    if ((bvm->bi_rw & 1) == WRITE)
3762        return biovec->bv_len; /* always allow writes to be mergeable */
3763
3764    if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3765        chunk_sectors = mddev->new_chunk_sectors;
3766    max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3767    if (max < 0) max = 0;
3768    if (max <= biovec->bv_len && bio_sectors == 0)
3769        return biovec->bv_len;
3770    else
3771        return max;
3772}
3773
3774
3775static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
3776{
3777    sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3778    unsigned int chunk_sectors = mddev->chunk_sectors;
3779    unsigned int bio_sectors = bio->bi_size >> 9;
3780
3781    if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3782        chunk_sectors = mddev->new_chunk_sectors;
3783    return chunk_sectors >=
3784        ((sector & (chunk_sectors - 1)) + bio_sectors);
3785}
3786
3787/*
3788 * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
3789 * later sampled by raid5d.
3790 */
3791static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
3792{
3793    unsigned long flags;
3794
3795    spin_lock_irqsave(&conf->device_lock, flags);
3796
3797    bi->bi_next = conf->retry_read_aligned_list;
3798    conf->retry_read_aligned_list = bi;
3799
3800    spin_unlock_irqrestore(&conf->device_lock, flags);
3801    md_wakeup_thread(conf->mddev->thread);
3802}
3803
3804
3805static struct bio *remove_bio_from_retry(struct r5conf *conf)
3806{
3807    struct bio *bi;
3808
3809    bi = conf->retry_read_aligned;
3810    if (bi) {
3811        conf->retry_read_aligned = NULL;
3812        return bi;
3813    }
3814    bi = conf->retry_read_aligned_list;
3815    if (bi) {
3816        conf->retry_read_aligned_list = bi->bi_next;
3817        bi->bi_next = NULL;
3818        /*
3819         * this sets the active stripe count to 1 and the processed
3820         * stripe count to zero (upper 8 bits)
3821         */
3822        raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3823    }
3824
3825    return bi;
3826}
3827
3828
3829/*
3830 * raid5_align_endio() checks whether the read succeeded and, if it did,
3831 * calls bio_endio on the original bio (having bio_put the cloned bio first).
3832 * If the read failed, the original bio is queued for a retry through the
3833 * stripe cache via add_bio_to_retry().
3834 */
3835static void raid5_align_endio(struct bio *bi, int error)
3836{
3837    struct bio* raid_bi = bi->bi_private;
3838    struct mddev *mddev;
3839    struct r5conf *conf;
3840    int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3841    struct md_rdev *rdev;
3842
3843    bio_put(bi);
3844
3845    rdev = (void*)raid_bi->bi_next;
3846    raid_bi->bi_next = NULL;
3847    mddev = rdev->mddev;
3848    conf = mddev->private;
3849
3850    rdev_dec_pending(rdev, conf->mddev);
3851
3852    if (!error && uptodate) {
3853        bio_endio(raid_bi, 0);
3854        if (atomic_dec_and_test(&conf->active_aligned_reads))
3855            wake_up(&conf->wait_for_stripe);
3856        return;
3857    }
3858
3859
3860    pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3861
3862    add_bio_to_retry(raid_bi, conf);
3863}
3864
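/*
 * bio_fits_rdev - check that the cloned bio stays within the target queue's
 * max_sectors and max_segments limits; give up if the queue has a
 * merge_bvec_fn, as we cannot easily honour it here.
 */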
3865static int bio_fits_rdev(struct bio *bi)
3866{
3867    struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3868
3869    if ((bi->bi_size>>9) > queue_max_sectors(q))
3870        return 0;
3871    blk_recount_segments(q, bi);
3872    if (bi->bi_phys_segments > queue_max_segments(q))
3873        return 0;
3874
3875    if (q->merge_bvec_fn)
3876        /* it's too hard to apply the merge_bvec_fn at this stage,
3877         * just give up
3878         */
3879        return 0;
3880
3881    return 1;
3882}
3883
3884
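/*
 * chunk_aligned_read - if a read fits entirely inside one chunk, bypass the
 * stripe cache: clone the bio, aim it at the single rdev (preferring a
 * fully-recovered replacement) that holds the data and submit it directly.
 * Returns 1 if the read was dispatched this way, 0 to fall back to the
 * normal stripe-by-stripe path.
 */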
3885static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
3886{
3887    struct r5conf *conf = mddev->private;
3888    int dd_idx;
3889    struct bio* align_bi;
3890    struct md_rdev *rdev;
3891    sector_t end_sector;
3892
3893    if (!in_chunk_boundary(mddev, raid_bio)) {
3894        pr_debug("chunk_aligned_read : non aligned\n");
3895        return 0;
3896    }
3897    /*
3898     * use bio_clone_mddev to make a copy of the bio
3899     */
3900    align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
3901    if (!align_bi)
3902        return 0;
3903    /*
3904     * set bi_end_io to a new function, and set bi_private to the
3905     * original bio.
3906     */
3907    align_bi->bi_end_io = raid5_align_endio;
3908    align_bi->bi_private = raid_bio;
3909    /*
3910     * compute position
3911     */
3912    align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
3913                            0,
3914                            &dd_idx, NULL);
3915
3916    end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
3917    rcu_read_lock();
3918    rdev = rcu_dereference(conf->disks[dd_idx].replacement);
3919    if (!rdev || test_bit(Faulty, &rdev->flags) ||
3920        rdev->recovery_offset < end_sector) {
3921        rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3922        if (rdev &&
3923            (test_bit(Faulty, &rdev->flags) ||
3924            !(test_bit(In_sync, &rdev->flags) ||
3925              rdev->recovery_offset >= end_sector)))
3926            rdev = NULL;
3927    }
3928    if (rdev) {
3929        sector_t first_bad;
3930        int bad_sectors;
3931
3932        atomic_inc(&rdev->nr_pending);
3933        rcu_read_unlock();
3934        raid_bio->bi_next = (void*)rdev;
3935        align_bi->bi_bdev = rdev->bdev;
3936        align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3937
3938        if (!bio_fits_rdev(align_bi) ||
3939            is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3940                &first_bad, &bad_sectors)) {
3941            /* too big in some way, or has a known bad block */
3942            bio_put(align_bi);
3943            rdev_dec_pending(rdev, mddev);
3944            return 0;
3945        }
3946
3947        /* No reshape active, so we can trust rdev->data_offset */
3948        align_bi->bi_sector += rdev->data_offset;
3949
3950        spin_lock_irq(&conf->device_lock);
3951        wait_event_lock_irq(conf->wait_for_stripe,
3952                    conf->quiesce == 0,
3953                    conf->device_lock, /* nothing */);
3954        atomic_inc(&conf->active_aligned_reads);
3955        spin_unlock_irq(&conf->device_lock);
3956
3957        generic_make_request(align_bi);
3958        return 1;
3959    } else {
3960        rcu_read_unlock();
3961        bio_put(align_bi);
3962        return 0;
3963    }
3964}
3965
3966/* __get_priority_stripe - get the next stripe to process
3967 *
3968 * Full stripe writes are allowed to pass preread active stripes up until
3969 * the bypass_threshold is exceeded. In general the bypass_count
3970 * increments when the handle_list is handled before the hold_list; however, it
3971 * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
3972 * will not be incremented when STRIPE_IO_STARTED is sampled set, signifying a
3973 * stripe with in-flight i/o. The bypass_count will be reset when the
3974 * handle_list.
3975 */
3976static struct stripe_head *__get_priority_stripe(struct r5conf *conf)
3977{
3978    struct stripe_head *sh;
3979
3980    pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3981          __func__,
3982          list_empty(&conf->handle_list) ? "empty" : "busy",
3983          list_empty(&conf->hold_list) ? "empty" : "busy",
3984          atomic_read(&conf->pending_full_writes), conf->bypass_count);
3985
3986    if (!list_empty(&conf->handle_list)) {
3987        sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3988
3989        if (list_empty(&conf->hold_list))
3990            conf->bypass_count = 0;
3991        else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3992            if (conf->hold_list.next == conf->last_hold)
3993                conf->bypass_count++;
3994            else {
3995                conf->last_hold = conf->hold_list.next;
3996                conf->bypass_count -= conf->bypass_threshold;
3997                if (conf->bypass_count < 0)
3998                    conf->bypass_count = 0;
3999            }
4000        }
4001    } else if (!list_empty(&conf->hold_list) &&
4002           ((conf->bypass_threshold &&
4003             conf->bypass_count > conf->bypass_threshold) ||
4004            atomic_read(&conf->pending_full_writes) == 0)) {
4005        sh = list_entry(conf->hold_list.next,
4006                typeof(*sh), lru);
4007        conf->bypass_count -= conf->bypass_threshold;
4008        if (conf->bypass_count < 0)
4009            conf->bypass_count = 0;
4010    } else
4011        return NULL;
4012
4013    list_del_init(&sh->lru);
4014    atomic_inc(&sh->count);
4015    BUG_ON(atomic_read(&sh->count) != 1);
4016    return sh;
4017}
4018
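/*
 * Stripes released while the caller holds a blk_plug are collected on a
 * per-plug list (raid5_plug_cb) and only passed to __release_stripe() when
 * the plug is flushed in raid5_unplug().
 */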
4019struct raid5_plug_cb {
4020    struct blk_plug_cb cb;
4021    struct list_head list;
4022};
4023
4024static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4025{
4026    struct raid5_plug_cb *cb = container_of(
4027        blk_cb, struct raid5_plug_cb, cb);
4028    struct stripe_head *sh;
4029    struct mddev *mddev = cb->cb.data;
4030    struct r5conf *conf = mddev->private;
4031
4032    if (cb->list.next && !list_empty(&cb->list)) {
4033        spin_lock_irq(&conf->device_lock);
4034        while (!list_empty(&cb->list)) {
4035            sh = list_first_entry(&cb->list, struct stripe_head, lru);
4036            list_del_init(&sh->lru);
4037            /*
4038             * avoid the race where release_stripe_plug() sees
4039             * STRIPE_ON_UNPLUG_LIST clear but the stripe
4040             * is still in our list
4041             */
4042            smp_mb__before_clear_bit();
4043            clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
4044            __release_stripe(conf, sh);
4045        }
4046        spin_unlock_irq(&conf->device_lock);
4047    }
4048    kfree(cb);
4049}
4050
4051static void release_stripe_plug(struct mddev *mddev,
4052                struct stripe_head *sh)
4053{
4054    struct blk_plug_cb *blk_cb = blk_check_plugged(
4055        raid5_unplug, mddev,
4056        sizeof(struct raid5_plug_cb));
4057    struct raid5_plug_cb *cb;
4058
4059    if (!blk_cb) {
4060        release_stripe(sh);
4061        return;
4062    }
4063
4064    cb = container_of(blk_cb, struct raid5_plug_cb, cb);
4065
4066    if (cb->list.next == NULL)
4067        INIT_LIST_HEAD(&cb->list);
4068
4069    if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
4070        list_add_tail(&sh->lru, &cb->list);
4071    else
4072        release_stripe(sh);
4073}
4074
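/*
 * make_request - entry point for normal I/O: walk the bio in STRIPE_SECTORS
 * sized pieces, attach each piece to the stripe that owns it (retrying
 * around reshape boundaries, suspended ranges and overlapping bios) and mark
 * the stripe for handling.
 */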
4075static void make_request(struct mddev *mddev, struct bio * bi)
4076{
4077    struct r5conf *conf = mddev->private;
4078    int dd_idx;
4079    sector_t new_sector;
4080    sector_t logical_sector, last_sector;
4081    struct stripe_head *sh;
4082    const int rw = bio_data_dir(bi);
4083    int remaining;
4084
4085    if (unlikely(bi->bi_rw & REQ_FLUSH)) {
4086        md_flush_request(mddev, bi);
4087        return;
4088    }
4089
4090    md_write_start(mddev, bi);
4091
4092    if (rw == READ &&
4093         mddev->reshape_position == MaxSector &&
4094         chunk_aligned_read(mddev,bi))
4095        return;
4096
4097    logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4098    last_sector = bi->bi_sector + (bi->bi_size>>9);
4099    bi->bi_next = NULL;
4100    bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4101
4102    for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
4103        DEFINE_WAIT(w);
4104        int previous;
4105
4106    retry:
4107        previous = 0;
4108        prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
4109        if (unlikely(conf->reshape_progress != MaxSector)) {
4110            /* spinlock is needed as reshape_progress may be
4111             * 64bit on a 32bit platform, and so it might be
4112             * possible to see a half-updated value
4113             * possible to see a half-updated value.
4114             * the lock is dropped, so once we get a reference
4115             * to the stripe that we think it is, we will have
4116             * to check again.
4117             */
4118            spin_lock_irq(&conf->device_lock);
4119            if (mddev->reshape_backwards
4120                ? logical_sector < conf->reshape_progress
4121                : logical_sector >= conf->reshape_progress) {
4122                previous = 1;
4123            } else {
4124                if (mddev->reshape_backwards
4125                    ? logical_sector < conf->reshape_safe
4126                    : logical_sector >= conf->reshape_safe) {
4127                    spin_unlock_irq(&conf->device_lock);
4128                    schedule();
4129                    goto retry;
4130                }
4131            }
4132            spin_unlock_irq(&conf->device_lock);
4133        }
4134
4135        new_sector = raid5_compute_sector(conf, logical_sector,
4136                          previous,
4137                          &dd_idx, NULL);
4138        pr_debug("raid456: make_request, sector %llu logical %llu\n",
4139            (unsigned long long)new_sector,
4140            (unsigned long long)logical_sector);
4141
4142        sh = get_active_stripe(conf, new_sector, previous,
4143                       (bi->bi_rw&RWA_MASK), 0);
4144        if (sh) {
4145            if (unlikely(previous)) {
4146                /* expansion might have moved on while waiting for a
4147                 * stripe, so we must do the range check again.
4148                 * Expansion could still move past after this
4149                 * test, but as we are holding a reference to
4150                 * 'sh', we know that if that happens,
4151                 * STRIPE_EXPANDING will get set and the expansion
4152                 * won't proceed until we finish with the stripe.
4153                 */
4154                int must_retry = 0;
4155                spin_lock_irq(&conf->device_lock);
4156                if (mddev->reshape_backwards
4157                    ? logical_sector >= conf->reshape_progress
4158                    : logical_sector < conf->reshape_progress)
4159                    /* mismatch, need to try again */
4160                    must_retry = 1;
4161                spin_unlock_irq(&conf->device_lock);
4162                if (must_retry) {
4163                    release_stripe(sh);
4164                    schedule();
4165                    goto retry;
4166                }
4167            }
4168
4169            if (rw == WRITE &&
4170                logical_sector >= mddev->suspend_lo &&
4171                logical_sector < mddev->suspend_hi) {
4172                release_stripe(sh);
4173                /* As the suspend_* range is controlled by
4174                 * userspace, we want an interruptible
4175                 * wait.
4176                 */
4177                flush_signals(current);
4178                prepare_to_wait(&conf->wait_for_overlap,
4179                        &w, TASK_INTERRUPTIBLE);
4180                if (logical_sector >= mddev->suspend_lo &&
4181                    logical_sector < mddev->suspend_hi)
4182                    schedule();
4183                goto retry;
4184            }
4185
4186            if (test_bit(STRIPE_EXPANDING, &sh->state) ||
4187                !add_stripe_bio(sh, bi, dd_idx, rw)) {
4188                /* Stripe is busy expanding or
4189                 * add failed due to overlap. Flush everything
4190                 * and wait a while
4191                 */
4192                md_wakeup_thread(mddev->thread);
4193                release_stripe(sh);
4194                schedule();
4195                goto retry;
4196            }
4197            finish_wait(&conf->wait_for_overlap, &w);
4198            set_bit(STRIPE_HANDLE, &sh->state);
4199            clear_bit(STRIPE_DELAYED, &sh->state);
4200            if ((bi->bi_rw & REQ_SYNC) &&
4201                !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4202                atomic_inc(&conf->preread_active_stripes);
4203            release_stripe_plug(mddev, sh);
4204        } else {
4205            /* cannot get stripe for read-ahead, just give up */
4206            clear_bit(BIO_UPTODATE, &bi->bi_flags);
4207            finish_wait(&conf->wait_for_overlap, &w);
4208            break;
4209        }
4210    }
4211
4212    remaining = raid5_dec_bi_active_stripes(bi);
4213    if (remaining == 0) {
4214
4215        if (rw == WRITE)
4216            md_write_end(mddev);
4217
4218        bio_endio(bi, 0);
4219    }
4220}
4221
4222static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
4223
4224static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
4225{
4226    /* reshaping is quite different to recovery/resync so it is
4227     * handled quite separately ... here.
4228     *
4229     * On each call to sync_request, we gather one chunk worth of
4230     * destination stripes and flag them as expanding.
4231     * Then we find all the source stripes and request reads.
4232     * As the reads complete, handle_stripe will copy the data
4233     * into the destination stripe and release that stripe.
4234     */
4235    struct r5conf *conf = mddev->private;
4236    struct stripe_head *sh;
4237    sector_t first_sector, last_sector;
4238    int raid_disks = conf->previous_raid_disks;
4239    int data_disks = raid_disks - conf->max_degraded;
4240    int new_data_disks = conf->raid_disks - conf->max_degraded;
4241    int i;
4242    int dd_idx;
4243    sector_t writepos, readpos, safepos;
4244    sector_t stripe_addr;
4245    int reshape_sectors;
4246    struct list_head stripes;
4247
4248    if (sector_nr == 0) {
4249        /* If restarting in the middle, skip the initial sectors */
4250        if (mddev->reshape_backwards &&
4251            conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4252            sector_nr = raid5_size(mddev, 0, 0)
4253                - conf->reshape_progress;
4254        } else if (!mddev->reshape_backwards &&
4255               conf->reshape_progress > 0)
4256            sector_nr = conf->reshape_progress;
4257        sector_div(sector_nr, new_data_disks);
4258        if (sector_nr) {
4259            mddev->curr_resync_completed = sector_nr;
4260            sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4261            *skipped = 1;
4262            return sector_nr;
4263        }
4264    }
4265
4266    /* We need to process a full chunk at a time.
4267     * If old and new chunk sizes differ, we need to process the
4268     * largest of these
4269     */
4270    if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4271        reshape_sectors = mddev->new_chunk_sectors;
4272    else
4273        reshape_sectors = mddev->chunk_sectors;
4274
4275    /* We update the metadata at least every 10 seconds, or when
4276     * the data about to be copied would over-write the source of
4277     * the data at the front of the range. i.e. one new_stripe
4278     * along from reshape_progress new_maps to after where
4279     * reshape_safe old_maps to
4280     */
4281    writepos = conf->reshape_progress;
4282    sector_div(writepos, new_data_disks);
4283    readpos = conf->reshape_progress;
4284    sector_div(readpos, data_disks);
4285    safepos = conf->reshape_safe;
4286    sector_div(safepos, data_disks);
4287    if (mddev->reshape_backwards) {
4288        writepos -= min_t(sector_t, reshape_sectors, writepos);
4289        readpos += reshape_sectors;
4290        safepos += reshape_sectors;
4291    } else {
4292        writepos += reshape_sectors;
4293        readpos -= min_t(sector_t, reshape_sectors, readpos);
4294        safepos -= min_t(sector_t, reshape_sectors, safepos);
4295    }
4296
4297    /* Having calculated the 'writepos' possibly use it
4298     * to set 'stripe_addr' which is where we will write to.
4299     */
4300    if (mddev->reshape_backwards) {
4301        BUG_ON(conf->reshape_progress == 0);
4302        stripe_addr = writepos;
4303        BUG_ON((mddev->dev_sectors &
4304            ~((sector_t)reshape_sectors - 1))
4305               - reshape_sectors - stripe_addr
4306               != sector_nr);
4307    } else {
4308        BUG_ON(writepos != sector_nr + reshape_sectors);
4309        stripe_addr = sector_nr;
4310    }
4311
4312    /* 'writepos' is the most advanced device address we might write.
4313     * 'readpos' is the least advanced device address we might read.
4314     * 'safepos' is the least address recorded in the metadata as having
4315     * been reshaped.
4316     * If there is a min_offset_diff, these are adjusted either by
4317     * increasing the safepos/readpos if diff is negative, or
4318     * increasing writepos if diff is positive.
4319     * If 'readpos' is then behind 'writepos', there is no way that we can
4320     * ensure safety in the face of a crash - that must be done by userspace
4321     * making a backup of the data. So in that case there is no particular
4322     * rush to update metadata.
4323     * Otherwise if 'safepos' is behind 'writepos', then we really need to
4324     * update the metadata to advance 'safepos' to match 'readpos' so that
4325     * we can be safe in the event of a crash.
4326     * So we insist on updating metadata if safepos is behind writepos and
4327     * readpos is beyond writepos.
4328     * In any case, update the metadata every 10 seconds.
4329     * Maybe that number should be configurable, but I'm not sure it is
4330     * worth it.... maybe it could be a multiple of safemode_delay???
4331     */
4332    if (conf->min_offset_diff < 0) {
4333        safepos += -conf->min_offset_diff;
4334        readpos += -conf->min_offset_diff;
4335    } else
4336        writepos += conf->min_offset_diff;
4337
4338    if ((mddev->reshape_backwards
4339         ? (safepos > writepos && readpos < writepos)
4340         : (safepos < writepos && readpos > writepos)) ||
4341        time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4342        /* Cannot proceed until we've updated the superblock... */
4343        wait_event(conf->wait_for_overlap,
4344               atomic_read(&conf->reshape_stripes)==0);
4345        mddev->reshape_position = conf->reshape_progress;
4346        mddev->curr_resync_completed = sector_nr;
4347        conf->reshape_checkpoint = jiffies;
4348        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4349        md_wakeup_thread(mddev->thread);
4350        wait_event(mddev->sb_wait, mddev->flags == 0 ||
4351               kthread_should_stop());
4352        spin_lock_irq(&conf->device_lock);
4353        conf->reshape_safe = mddev->reshape_position;
4354        spin_unlock_irq(&conf->device_lock);
4355        wake_up(&conf->wait_for_overlap);
4356        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4357    }
4358
4359    INIT_LIST_HEAD(&stripes);
4360    for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4361        int j;
4362        int skipped_disk = 0;
4363        sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4364        set_bit(STRIPE_EXPANDING, &sh->state);
4365        atomic_inc(&conf->reshape_stripes);
4366        /* If any of this stripe is beyond the end of the old
4367         * array, then we need to zero those blocks
4368         */
4369        for (j=sh->disks; j--;) {
4370            sector_t s;
4371            if (j == sh->pd_idx)
4372                continue;
4373            if (conf->level == 6 &&
4374                j == sh->qd_idx)
4375                continue;
4376            s = compute_blocknr(sh, j, 0);
4377            if (s < raid5_size(mddev, 0, 0)) {
4378                skipped_disk = 1;
4379                continue;
4380            }
4381            memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4382            set_bit(R5_Expanded, &sh->dev[j].flags);
4383            set_bit(R5_UPTODATE, &sh->dev[j].flags);
4384        }
4385        if (!skipped_disk) {
4386            set_bit(STRIPE_EXPAND_READY, &sh->state);
4387            set_bit(STRIPE_HANDLE, &sh->state);
4388        }
4389        list_add(&sh->lru, &stripes);
4390    }
4391    spin_lock_irq(&conf->device_lock);
4392    if (mddev->reshape_backwards)
4393        conf->reshape_progress -= reshape_sectors * new_data_disks;
4394    else
4395        conf->reshape_progress += reshape_sectors * new_data_disks;
4396    spin_unlock_irq(&conf->device_lock);
4397    /* Ok, those stripes are ready. We can start scheduling
4398     * reads on the source stripes.
4399     * The source stripes are determined by mapping the first and last
4400     * block on the destination stripes.
4401     */
4402    first_sector =
4403        raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4404                     1, &dd_idx, NULL);
4405    last_sector =
4406        raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4407                        * new_data_disks - 1),
4408                     1, &dd_idx, NULL);
4409    if (last_sector >= mddev->dev_sectors)
4410        last_sector = mddev->dev_sectors - 1;
4411    while (first_sector <= last_sector) {
4412        sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4413        set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4414        set_bit(STRIPE_HANDLE, &sh->state);
4415        release_stripe(sh);
4416        first_sector += STRIPE_SECTORS;
4417    }
4418    /* Now that the sources are clearly marked, we can release
4419     * the destination stripes
4420     */
4421    while (!list_empty(&stripes)) {
4422        sh = list_entry(stripes.next, struct stripe_head, lru);
4423        list_del_init(&sh->lru);
4424        release_stripe(sh);
4425    }
4426    /* If this takes us to the resync_max point where we have to pause,
4427     * then we need to write out the superblock.
4428     */
4429    sector_nr += reshape_sectors;
4430    if ((sector_nr - mddev->curr_resync_completed) * 2
4431        >= mddev->resync_max - mddev->curr_resync_completed) {
4432        /* Cannot proceed until we've updated the superblock... */
4433        wait_event(conf->wait_for_overlap,
4434               atomic_read(&conf->reshape_stripes) == 0);
4435        mddev->reshape_position = conf->reshape_progress;
4436        mddev->curr_resync_completed = sector_nr;
4437        conf->reshape_checkpoint = jiffies;
4438        set_bit(MD_CHANGE_DEVS, &mddev->flags);
4439        md_wakeup_thread(mddev->thread);
4440        wait_event(mddev->sb_wait,
4441               !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4442               || kthread_should_stop());
4443        spin_lock_irq(&conf->device_lock);
4444        conf->reshape_safe = mddev->reshape_position;
4445        spin_unlock_irq(&conf->device_lock);
4446        wake_up(&conf->wait_for_overlap);
4447        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4448    }
4449    return reshape_sectors;
4450}
4451
4452/* FIXME go_faster isn't used */
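/*
 * sync_request - resync/recovery entry point: hand reshape work off to
 * reshape_request(), skip regions the bitmap says are clean, otherwise sync
 * one stripe (STRIPE_SECTORS) at a time via handle_stripe().
 */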
4453static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
4454{
4455    struct r5conf *conf = mddev->private;
4456    struct stripe_head *sh;
4457    sector_t max_sector = mddev->dev_sectors;
4458    sector_t sync_blocks;
4459    int still_degraded = 0;
4460    int i;
4461
4462    if (sector_nr >= max_sector) {
4463        /* just being told to finish up .. nothing much to do */
4464
4465        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4466            end_reshape(conf);
4467            return 0;
4468        }
4469
4470        if (mddev->curr_resync < max_sector) /* aborted */
4471            bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4472                    &sync_blocks, 1);
4473        else /* completed sync */
4474            conf->fullsync = 0;
4475        bitmap_close_sync(mddev->bitmap);
4476
4477        return 0;
4478    }
4479
4480    /* Allow raid5_quiesce to complete */
4481    wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4482
4483    if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4484        return reshape_request(mddev, sector_nr, skipped);
4485
4486    /* No need to check resync_max as we never do more than one
4487     * stripe, and as resync_max will always be on a chunk boundary,
4488     * if the check in md_do_sync didn't fire, there is no chance
4489     * of overstepping resync_max here
4490     */
4491
4492    /* if there are too many failed drives and we are trying
4493     * to resync, then assert that we are finished, because there is
4494     * nothing we can do.
4495     */
4496    if (mddev->degraded >= conf->max_degraded &&
4497        test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4498        sector_t rv = mddev->dev_sectors - sector_nr;
4499        *skipped = 1;
4500        return rv;
4501    }
4502    if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4503        !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4504        !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4505        /* we can skip this block, and probably more */
4506        sync_blocks /= STRIPE_SECTORS;
4507        *skipped = 1;
4508        return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4509    }
4510
4511    bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4512
4513    sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4514    if (sh == NULL) {
4515        sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4516        /* make sure we don't swamp the stripe cache if someone else
4517         * is trying to get access
4518         */
4519        schedule_timeout_uninterruptible(1);
4520    }
4521    /* Need to check if array will still be degraded after recovery/resync
4522     * We don't need to check the 'failed' flag as when that gets set,
4523     * recovery aborts.
4524     */
4525    for (i = 0; i < conf->raid_disks; i++)
4526        if (conf->disks[i].rdev == NULL)
4527            still_degraded = 1;
4528
4529    bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4530
4531    set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
4532
4533    handle_stripe(sh);
4534    release_stripe(sh);
4535
4536    return STRIPE_SECTORS;
4537}
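/*
 * Illustrative sketch only (example_whole_stripe_skip is not part of the
 * driver): the bitmap fast-path above skips ahead by whole stripes.  With
 * 4KiB pages STRIPE_SECTORS is 8, so if the bitmap reports e.g. 100
 * skippable sectors, sync_request() returns 96 and progress stays
 * stripe-aligned.
 */
static inline sector_t example_whole_stripe_skip(sector_t sync_blocks)
{
    /* same rounding as the "*skipped = 1" path in sync_request() */
    return (sync_blocks / STRIPE_SECTORS) * STRIPE_SECTORS;
}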
4538
4539static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4540{
4541    /* We may not be able to submit a whole bio at once as there
4542     * may not be enough stripe_heads available.
4543     * We cannot pre-allocate enough stripe_heads as we may need
4544     * more than exist in the cache (if we allow ever larger chunks).
4545     * So we do one stripe head at a time and record, via
4546     * raid5_set_bi_processed_stripes(), how many have been done.
4547     *
4548     * We *know* that this entire raid_bio is in one chunk, so
4549     * there will be only one 'dd_idx' and we need only one call to raid5_compute_sector.
4550     */
4551    struct stripe_head *sh;
4552    int dd_idx;
4553    sector_t sector, logical_sector, last_sector;
4554    int scnt = 0;
4555    int remaining;
4556    int handled = 0;
4557
4558    logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4559    sector = raid5_compute_sector(conf, logical_sector,
4560                      0, &dd_idx, NULL);
4561    last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4562
4563    for (; logical_sector < last_sector;
4564         logical_sector += STRIPE_SECTORS,
4565             sector += STRIPE_SECTORS,
4566             scnt++) {
4567
4568        if (scnt < raid5_bi_processed_stripes(raid_bio))
4569            /* already done this stripe */
4570            continue;
4571
4572        sh = get_active_stripe(conf, sector, 0, 1, 0);
4573
4574        if (!sh) {
4575            /* failed to get a stripe - must wait */
4576            raid5_set_bi_processed_stripes(raid_bio, scnt);
4577            conf->retry_read_aligned = raid_bio;
4578            return handled;
4579        }
4580
4581        if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4582            release_stripe(sh);
4583            raid5_set_bi_processed_stripes(raid_bio, scnt);
4584            conf->retry_read_aligned = raid_bio;
4585            return handled;
4586        }
4587
4588        set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4589        handle_stripe(sh);
4590        release_stripe(sh);
4591        handled++;
4592    }
4593    remaining = raid5_dec_bi_active_stripes(raid_bio);
4594    if (remaining == 0)
4595        bio_endio(raid_bio, 0);
4596    if (atomic_dec_and_test(&conf->active_aligned_reads))
4597        wake_up(&conf->wait_for_stripe);
4598    return handled;
4599}
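/*
 * Illustrative sketch only (example_aligned_read_steps is not part of the
 * driver): how many STRIPE_SECTORS-sized steps the retry loop above takes
 * for one bio.  E.g. with STRIPE_SECTORS == 8, a 16-sector bio starting at
 * sector 1003 is walked from logical sector 1000 and needs three
 * iterations (1000, 1008, 1016).
 */
static inline int example_aligned_read_steps(sector_t bi_sector, unsigned int sectors)
{
    sector_t first = bi_sector & ~((sector_t)STRIPE_SECTORS - 1);
    sector_t last = bi_sector + sectors;

    return (int)((last - first + STRIPE_SECTORS - 1) / STRIPE_SECTORS);
}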
4600
4601#define MAX_STRIPE_BATCH 8
4602static int handle_active_stripes(struct r5conf *conf)
4603{
4604    struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
4605    int i, batch_size = 0;
4606
4607    while (batch_size < MAX_STRIPE_BATCH &&
4608            (sh = __get_priority_stripe(conf)) != NULL)
4609        batch[batch_size++] = sh;
4610
4611    if (batch_size == 0)
4612        return batch_size;
4613    spin_unlock_irq(&conf->device_lock);
4614
4615    for (i = 0; i < batch_size; i++)
4616        handle_stripe(batch[i]);
4617
4618    cond_resched();
4619
4620    spin_lock_irq(&conf->device_lock);
4621    for (i = 0; i < batch_size; i++)
4622        __release_stripe(conf, batch[i]);
4623    return batch_size;
4624}
4625
4626/*
4627 * This is our raid5 kernel thread.
4628 *
4629 * We scan the hash table for stripes which can be handled now.
4630 * During the scan, completed stripes are saved for us by the interrupt
4631 * handler, so that they will not have to wait for our next wakeup.
4632 */
4633static void raid5d(struct mddev *mddev)
4634{
4635    struct r5conf *conf = mddev->private;
4636    int handled;
4637    struct blk_plug plug;
4638
4639    pr_debug("+++ raid5d active\n");
4640
4641    md_check_recovery(mddev);
4642
4643    blk_start_plug(&plug);
4644    handled = 0;
4645    spin_lock_irq(&conf->device_lock);
4646    while (1) {
4647        struct bio *bio;
4648        int batch_size;
4649
4650        if (!list_empty(&conf->bitmap_list)) {
4652            /* Now is a good time to flush some bitmap updates */
4653            conf->seq_flush++;
4654            spin_unlock_irq(&conf->device_lock);
4655            bitmap_unplug(mddev->bitmap);
4656            spin_lock_irq(&conf->device_lock);
4657            conf->seq_write = conf->seq_flush;
4658            activate_bit_delay(conf);
4659        }
4660        raid5_activate_delayed(conf);
4661
4662        while ((bio = remove_bio_from_retry(conf))) {
4663            int ok;
4664            spin_unlock_irq(&conf->device_lock);
4665            ok = retry_aligned_read(conf, bio);
4666            spin_lock_irq(&conf->device_lock);
4667            if (!ok)
4668                break;
4669            handled++;
4670        }
4671
4672        batch_size = handle_active_stripes(conf);
4673        if (!batch_size)
4674            break;
4675        handled += batch_size;
4676
4677        if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
4678            spin_unlock_irq(&conf->device_lock);
4679            md_check_recovery(mddev);
4680            spin_lock_irq(&conf->device_lock);
4681        }
4682    }
4683    pr_debug("%d stripes handled\n", handled);
4684
4685    spin_unlock_irq(&conf->device_lock);
4686
4687    async_tx_issue_pending_all();
4688    blk_finish_plug(&plug);
4689
4690    pr_debug("--- raid5d inactive\n");
4691}
4692
4693static ssize_t
4694raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
4695{
4696    struct r5conf *conf = mddev->private;
4697    if (conf)
4698        return sprintf(page, "%d\n", conf->max_nr_stripes);
4699    else
4700        return 0;
4701}
4702
4703int
4704raid5_set_cache_size(struct mddev *mddev, int size)
4705{
4706    struct r5conf *conf = mddev->private;
4707    int err;
4708
4709    if (size <= 16 || size > 32768)
4710        return -EINVAL;
4711    while (size < conf->max_nr_stripes) {
4712        if (drop_one_stripe(conf))
4713            conf->max_nr_stripes--;
4714        else
4715            break;
4716    }
4717    err = md_allow_write(mddev);
4718    if (err)
4719        return err;
4720    while (size > conf->max_nr_stripes) {
4721        if (grow_one_stripe(conf))
4722            conf->max_nr_stripes++;
4723        else break;
4724    }
4725    return 0;
4726}
4727EXPORT_SYMBOL(raid5_set_cache_size);
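/*
 * raid5_set_cache_size() is normally reached through the stripe_cache_size
 * sysfs attribute defined below (typically writing a value in the 17..32768
 * range to /sys/block/mdX/md/stripe_cache_size); anything outside that
 * range is rejected above with -EINVAL.
 */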
4728
4729static ssize_t
4730raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
4731{
4732    struct r5conf *conf = mddev->private;
4733    unsigned long new;
4734    int err;
4735
4736    if (len >= PAGE_SIZE)
4737        return -EINVAL;
4738    if (!conf)
4739        return -ENODEV;
4740
4741    if (strict_strtoul(page, 10, &new))
4742        return -EINVAL;
4743    err = raid5_set_cache_size(mddev, new);
4744    if (err)
4745        return err;
4746    return len;
4747}
4748
4749static struct md_sysfs_entry
4750raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4751                raid5_show_stripe_cache_size,
4752                raid5_store_stripe_cache_size);
4753
4754static ssize_t
4755raid5_show_preread_threshold(struct mddev *mddev, char *page)
4756{
4757    struct r5conf *conf = mddev->private;
4758    if (conf)
4759        return sprintf(page, "%d\n", conf->bypass_threshold);
4760    else
4761        return 0;
4762}
4763
4764static ssize_t
4765raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
4766{
4767    struct r5conf *conf = mddev->private;
4768    unsigned long new;
4769    if (len >= PAGE_SIZE)
4770        return -EINVAL;
4771    if (!conf)
4772        return -ENODEV;
4773
4774    if (strict_strtoul(page, 10, &new))
4775        return -EINVAL;
4776    if (new > conf->max_nr_stripes)
4777        return -EINVAL;
4778    conf->bypass_threshold = new;
4779    return len;
4780}
4781
4782static struct md_sysfs_entry
4783raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4784                    S_IRUGO | S_IWUSR,
4785                    raid5_show_preread_threshold,
4786                    raid5_store_preread_threshold);
4787
4788static ssize_t
4789stripe_cache_active_show(struct mddev *mddev, char *page)
4790{
4791    struct r5conf *conf = mddev->private;
4792    if (conf)
4793        return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4794    else
4795        return 0;
4796}
4797
4798static struct md_sysfs_entry
4799raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4800
4801static struct attribute *raid5_attrs[] = {
4802    &raid5_stripecache_size.attr,
4803    &raid5_stripecache_active.attr,
4804    &raid5_preread_bypass_threshold.attr,
4805    NULL,
4806};
4807static struct attribute_group raid5_attrs_group = {
4808    .name = NULL,
4809    .attrs = raid5_attrs,
4810};
4811
4812static sector_t
4813raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
4814{
4815    struct r5conf *conf = mddev->private;
4816
4817    if (!sectors)
4818        sectors = mddev->dev_sectors;
4819    if (!raid_disks)
4820        /* size is defined by the smallest of previous and new size */
4821        raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4822
4823    sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4824    sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4825    return sectors * (raid_disks - conf->max_degraded);
4826}
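/*
 * Worked example for the size calculation above (illustrative numbers):
 * four 1000000-sector members with chunk_sectors == 1024 and
 * max_degraded == 1 give 1000000 & ~1023 == 999424 usable sectors per
 * device, so the array exports 999424 * (4 - 1) == 2998272 sectors.
 */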
4827
4828static void raid5_free_percpu(struct r5conf *conf)
4829{
4830    struct raid5_percpu *percpu;
4831    unsigned long cpu;
4832
4833    if (!conf->percpu)
4834        return;
4835
4836    get_online_cpus();
4837    for_each_possible_cpu(cpu) {
4838        percpu = per_cpu_ptr(conf->percpu, cpu);
4839        safe_put_page(percpu->spare_page);
4840        kfree(percpu->scribble);
4841    }
4842#ifdef CONFIG_HOTPLUG_CPU
4843    unregister_cpu_notifier(&conf->cpu_notify);
4844#endif
4845    put_online_cpus();
4846
4847    free_percpu(conf->percpu);
4848}
4849
4850static void free_conf(struct r5conf *conf)
4851{
4852    shrink_stripes(conf);
4853    raid5_free_percpu(conf);
4854    kfree(conf->disks);
4855    kfree(conf->stripe_hashtbl);
4856    kfree(conf);
4857}
4858
4859#ifdef CONFIG_HOTPLUG_CPU
4860static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4861                  void *hcpu)
4862{
4863    struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
4864    long cpu = (long)hcpu;
4865    struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4866
4867    switch (action) {
4868    case CPU_UP_PREPARE:
4869    case CPU_UP_PREPARE_FROZEN:
4870        if (conf->level == 6 && !percpu->spare_page)
4871            percpu->spare_page = alloc_page(GFP_KERNEL);
4872        if (!percpu->scribble)
4873            percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4874
4875        if (!percpu->scribble ||
4876            (conf->level == 6 && !percpu->spare_page)) {
4877            safe_put_page(percpu->spare_page);
4878            kfree(percpu->scribble);
4879            pr_err("%s: failed memory allocation for cpu%ld\n",
4880                   __func__, cpu);
4881            return notifier_from_errno(-ENOMEM);
4882        }
4883        break;
4884    case CPU_DEAD:
4885    case CPU_DEAD_FROZEN:
4886        safe_put_page(percpu->spare_page);
4887        kfree(percpu->scribble);
4888        percpu->spare_page = NULL;
4889        percpu->scribble = NULL;
4890        break;
4891    default:
4892        break;
4893    }
4894    return NOTIFY_OK;
4895}
4896#endif
4897
4898static int raid5_alloc_percpu(struct r5conf *conf)
4899{
4900    unsigned long cpu;
4901    struct page *spare_page;
4902    struct raid5_percpu __percpu *allcpus;
4903    void *scribble;
4904    int err;
4905
4906    allcpus = alloc_percpu(struct raid5_percpu);
4907    if (!allcpus)
4908        return -ENOMEM;
4909    conf->percpu = allcpus;
4910
4911    get_online_cpus();
4912    err = 0;
4913    for_each_present_cpu(cpu) {
4914        if (conf->level == 6) {
4915            spare_page = alloc_page(GFP_KERNEL);
4916            if (!spare_page) {
4917                err = -ENOMEM;
4918                break;
4919            }
4920            per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4921        }
4922        scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4923        if (!scribble) {
4924            err = -ENOMEM;
4925            break;
4926        }
4927        per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4928    }
4929#ifdef CONFIG_HOTPLUG_CPU
4930    conf->cpu_notify.notifier_call = raid456_cpu_notify;
4931    conf->cpu_notify.priority = 0;
4932    if (err == 0)
4933        err = register_cpu_notifier(&conf->cpu_notify);
4934#endif
4935    put_online_cpus();
4936
4937    return err;
4938}
4939
4940static struct r5conf *setup_conf(struct mddev *mddev)
4941{
4942    struct r5conf *conf;
4943    int raid_disk, memory, max_disks;
4944    struct md_rdev *rdev;
4945    struct disk_info *disk;
4946    char pers_name[6];
4947
4948    if (mddev->new_level != 5
4949        && mddev->new_level != 4
4950        && mddev->new_level != 6) {
4951        printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
4952               mdname(mddev), mddev->new_level);
4953        return ERR_PTR(-EIO);
4954    }
4955    if ((mddev->new_level == 5
4956         && !algorithm_valid_raid5(mddev->new_layout)) ||
4957        (mddev->new_level == 6
4958         && !algorithm_valid_raid6(mddev->new_layout))) {
4959        printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
4960               mdname(mddev), mddev->new_layout);
4961        return ERR_PTR(-EIO);
4962    }
4963    if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4964        printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
4965               mdname(mddev), mddev->raid_disks);
4966        return ERR_PTR(-EINVAL);
4967    }
4968
4969    if (!mddev->new_chunk_sectors ||
4970        (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4971        !is_power_of_2(mddev->new_chunk_sectors)) {
4972        printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
4973               mdname(mddev), mddev->new_chunk_sectors << 9);
4974        return ERR_PTR(-EINVAL);
4975    }
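    /*
     * Example of the constraint above, assuming 4KiB pages: a 512KiB chunk
     * (new_chunk_sectors == 1024) passes, a 192KiB chunk is rejected because
     * 384 sectors is not a power of two, and a 2KiB chunk is rejected
     * because 2048 bytes is not a multiple of the page size.
     */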
4976
4977    conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
4978    if (conf == NULL)
4979        goto abort;
4980    spin_lock_init(&conf->device_lock);
4981    init_waitqueue_head(&conf->wait_for_stripe);
4982    init_waitqueue_head(&conf->wait_for_overlap);
4983    INIT_LIST_HEAD(&conf->handle_list);
4984    INIT_LIST_HEAD(&conf->hold_list);
4985    INIT_LIST_HEAD(&conf->delayed_list);
4986    INIT_LIST_HEAD(&conf->bitmap_list);
4987    INIT_LIST_HEAD(&conf->inactive_list);
4988    atomic_set(&conf->active_stripes, 0);
4989    atomic_set(&conf->preread_active_stripes, 0);
4990    atomic_set(&conf->active_aligned_reads, 0);
4991    conf->bypass_threshold = BYPASS_THRESHOLD;
4992    conf->recovery_disabled = mddev->recovery_disabled - 1;
4993
4994    conf->raid_disks = mddev->raid_disks;
4995    if (mddev->reshape_position == MaxSector)
4996        conf->previous_raid_disks = mddev->raid_disks;
4997    else
4998        conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4999    max_disks = max(conf->raid_disks, conf->previous_raid_disks);
5000    conf->scribble_len = scribble_len(max_disks);
5001
5002    conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
5003                  GFP_KERNEL);
5004    if (!conf->disks)
5005        goto abort;
5006
5007    conf->mddev = mddev;
5008
5009    if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
5010        goto abort;
5011
5012    conf->level = mddev->new_level;
5013    if (raid5_alloc_percpu(conf) != 0)
5014        goto abort;
5015
5016    pr_debug("raid456: run(%s) called.\n", mdname(mddev));
5017
5018    rdev_for_each(rdev, mddev) {
5019        raid_disk = rdev->raid_disk;
5020        if (raid_disk >= max_disks
5021            || raid_disk < 0)
5022            continue;
5023        disk = conf->disks + raid_disk;
5024
5025        if (test_bit(Replacement, &rdev->flags)) {
5026            if (disk->replacement)
5027                goto abort;
5028            disk->replacement = rdev;
5029        } else {
5030            if (disk->rdev)
5031                goto abort;
5032            disk->rdev = rdev;
5033        }
5034
5035        if (test_bit(In_sync, &rdev->flags)) {
5036            char b[BDEVNAME_SIZE];
5037            printk(KERN_INFO "md/raid:%s: device %s operational as raid"
5038                   " disk %d\n",
5039                   mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
5040        } else if (rdev->saved_raid_disk != raid_disk)
5041            /* Cannot rely on bitmap to complete recovery */
5042            conf->fullsync = 1;
5043    }
5044
5045    conf->chunk_sectors = mddev->new_chunk_sectors;
5046    conf->level = mddev->new_level;
5047    if (conf->level == 6)
5048        conf->max_degraded = 2;
5049    else
5050        conf->max_degraded = 1;
5051    conf->algorithm = mddev->new_layout;
5052    conf->max_nr_stripes = NR_STRIPES;
5053    conf->reshape_progress = mddev->reshape_position;
5054    if (conf->reshape_progress != MaxSector) {
5055        conf->prev_chunk_sectors = mddev->chunk_sectors;
5056        conf->prev_algo = mddev->layout;
5057    }
5058
5059    memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
5060         max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
5061    if (grow_stripes(conf, conf->max_nr_stripes)) {
5062        printk(KERN_ERR
5063               "md/raid:%s: couldn't allocate %dkB for buffers\n",
5064               mdname(mddev), memory);
5065        goto abort;
5066    } else
5067        printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
5068               mdname(mddev), memory);
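    /*
     * Rough illustration of the figure printed above, assuming 4KiB pages
     * and a 4-device array: the dominant term is
     * max_nr_stripes * max_disks * PAGE_SIZE == 256 * 4 * 4KiB == 4096KiB,
     * with the struct stripe_head and struct bio headers adding a little
     * on top.
     */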
5069
5070    sprintf(pers_name, "raid%d", mddev->new_level);
5071    conf->thread = md_register_thread(raid5d, mddev, pers_name);
5072    if (!conf->thread) {
5073        printk(KERN_ERR
5074               "md/raid:%s: couldn't allocate thread.\n",
5075               mdname(mddev));
5076        goto abort;
5077    }
5078
5079    return conf;
5080
5081 abort:
5082    if (conf) {
5083        free_conf(conf);
5084        return ERR_PTR(-EIO);
5085    } else
5086        return ERR_PTR(-ENOMEM);
5087}
5088
5089
5090static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
5091{
5092    switch (algo) {
5093    case ALGORITHM_PARITY_0:
5094        if (raid_disk < max_degraded)
5095            return 1;
5096        break;
5097    case ALGORITHM_PARITY_N:
5098        if (raid_disk >= raid_disks - max_degraded)
5099            return 1;
5100        break;
5101    case ALGORITHM_PARITY_0_6:
5102        if (raid_disk == 0 ||
5103            raid_disk == raid_disks - 1)
5104            return 1;
5105        break;
5106    case ALGORITHM_LEFT_ASYMMETRIC_6:
5107    case ALGORITHM_RIGHT_ASYMMETRIC_6:
5108    case ALGORITHM_LEFT_SYMMETRIC_6:
5109    case ALGORITHM_RIGHT_SYMMETRIC_6:
5110        if (raid_disk == raid_disks - 1)
5111            return 1;
5112    }
5113    return 0;
5114}
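/*
 * Example (illustrative): for a 6-device RAID6 array (max_degraded == 2)
 * using ALGORITHM_PARITY_0, devices 0 and 1 hold nothing but parity, so
 * only_parity(0, ALGORITHM_PARITY_0, 6, 2) and only_parity(1, ...) return 1
 * while the data devices return 0.
 */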
5115
5116static int run(struct mddev *mddev)
5117{
5118    struct r5conf *conf;
5119    int working_disks = 0;
5120    int dirty_parity_disks = 0;
5121    struct md_rdev *rdev;
5122    sector_t reshape_offset = 0;
5123    int i;
5124    long long min_offset_diff = 0;
5125    int first = 1;
5126
5127    if (mddev->recovery_cp != MaxSector)
5128        printk(KERN_NOTICE "md/raid:%s: not clean"
5129               " -- starting background reconstruction\n",
5130               mdname(mddev));
5131
5132    rdev_for_each(rdev, mddev) {
5133        long long diff;
5134        if (rdev->raid_disk < 0)
5135            continue;
5136        diff = (rdev->new_data_offset - rdev->data_offset);
5137        if (first) {
5138            min_offset_diff = diff;
5139            first = 0;
5140        } else if (mddev->reshape_backwards &&
5141             diff < min_offset_diff)
5142            min_offset_diff = diff;
5143        else if (!mddev->reshape_backwards &&
5144             diff > min_offset_diff)
5145            min_offset_diff = diff;
5146    }
5147
5148    if (mddev->reshape_position != MaxSector) {
5149        /* Check that we can continue the reshape.
5150         * Difficulties arise if the stripe we would write to
5151         * next is at or after the stripe we would read from next.
5152         * For a reshape that changes the number of devices, this
5153         * is only possible for a very short time, and mdadm makes
5154         * sure that time appears to have past before assembling
5155         * sure that time appears to have passed before assembling
5156         * For a reshape that keeps the number of devices the same
5157         * mdadm must be monitoring the reshape and keeping the
5158         * critical areas read-only and backed up. It will start
5159         * the array in read-only mode, so we check for that.
5160         */
5161        sector_t here_new, here_old;
5162        int old_disks;
5163        int max_degraded = (mddev->level == 6 ? 2 : 1);
5164
5165        if (mddev->new_level != mddev->level) {
5166            printk(KERN_ERR "md/raid:%s: unsupported reshape "
5167                   "required - aborting.\n",
5168                   mdname(mddev));
5169            return -EINVAL;
5170        }
5171        old_disks = mddev->raid_disks - mddev->delta_disks;
5172        /* reshape_position must be on a new-stripe boundary, and one
5173         * further up in new geometry must map after here in old
5174         * geometry.
5175         */
5176        here_new = mddev->reshape_position;
5177        if (sector_div(here_new, mddev->new_chunk_sectors *
5178                   (mddev->raid_disks - max_degraded))) {
5179            printk(KERN_ERR "md/raid:%s: reshape_position not "
5180                   "on a stripe boundary\n", mdname(mddev));
5181            return -EINVAL;
5182        }
5183        reshape_offset = here_new * mddev->new_chunk_sectors;
5184        /* here_new is the stripe we will write to */
5185        here_old = mddev->reshape_position;
5186        sector_div(here_old, mddev->chunk_sectors *
5187               (old_disks-max_degraded));
5188        /* here_old is the first stripe that we might need to read
5189         * from */
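        /*
         * Worked example (illustrative numbers): with mddev->raid_disks == 5,
         * max_degraded == 1 and new_chunk_sectors == 1024, one new-geometry
         * stripe spans (5 - 1) * 1024 == 4096 sectors, so a reshape_position
         * of 81920 gives here_new == 20 with no remainder and is accepted;
         * any position that is not a multiple of 4096 fails the boundary
         * check above.
         */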
5190        if (mddev->delta_disks == 0) {
5191            if ((here_new * mddev->new_chunk_sectors !=
5192                 here_old * mddev->chunk_sectors)) {
5193                printk(KERN_ERR "md/raid:%s: reshape position is"
5194                       " confused - aborting\n", mdname(mddev));
5195                return -EINVAL;
5196            }
5197            /* We cannot be sure it is safe to start an in-place
5198             * reshape. It is only safe if user-space is monitoring
5199             * and taking constant backups.
5200             * mdadm always starts a situation like this in
5201             * readonly mode so it can take control before
5202             * allowing any writes. So just check for that.
5203             */
5204            if (abs(min_offset_diff) >= mddev->chunk_sectors &&
5205                abs(min_offset_diff) >= mddev->new_chunk_sectors)
5206                /* not really in-place - so OK */;
5207            else if (mddev->ro == 0) {
5208                printk(KERN_ERR "md/raid:%s: in-place reshape "
5209                       "must be started in read-only mode "
5210                       "- aborting\n",
5211                       mdname(mddev));
5212                return -EINVAL;
5213            }
5214        } else if (mddev->reshape_backwards
5215            ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
5216               here_old * mddev->chunk_sectors)
5217            : (here_new * mddev->new_chunk_sectors >=
5218               here_old * mddev->chunk_sectors + (-min_offset_diff))) {
5219            /* Reading from the same stripe as writing to - bad */
5220            printk(KERN_ERR "md/raid:%s: reshape_position too early for "
5221                   "auto-recovery - aborting.\n",
5222                   mdname(mddev));
5223            return -EINVAL;
5224        }
5225        printk(KERN_INFO "md/raid:%s: reshape will continue\n",
5226               mdname(mddev));
5227        /* OK, we should be able to continue; */
5228    } else {
5229        BUG_ON(mddev->level != mddev->new_level);
5230        BUG_ON(mddev->layout != mddev->new_layout);
5231        BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
5232        BUG_ON(mddev->delta_disks != 0);
5233    }
5234
5235    if (mddev->private == NULL)
5236        conf = setup_conf(mddev);
5237    else
5238        conf = mddev->private;
5239
5240    if (IS_ERR(conf))
5241        return PTR_ERR(conf);
5242
5243    conf->min_offset_diff = min_offset_diff;
5244    mddev->thread = conf->thread;
5245    conf->thread = NULL;
5246    mddev->private = conf;
5247
5248    for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
5249         i++) {
5250        rdev = conf->disks[i].rdev;
5251        if (!rdev && conf->disks[i].replacement) {
5252            /* The replacement is all we have yet */
5253            rdev = conf->disks[i].replacement;
5254            conf->disks[i].replacement = NULL;
5255            clear_bit(Replacement, &rdev->flags);
5256            conf->disks[i].rdev = rdev;
5257        }
5258        if (!rdev)
5259            continue;
5260        if (conf->disks[i].replacement &&
5261            conf->reshape_progress != MaxSector) {
5262            /* replacements and reshape simply do not mix. */
5263            printk(KERN_ERR "md: cannot handle concurrent "
5264                   "replacement and reshape.\n");
5265            goto abort;
5266        }
5267        if (test_bit(In_sync, &rdev->flags)) {
5268            working_disks++;
5269            continue;
5270        }
5271        /* This disc is not fully in-sync. However if it
5272         * just stored parity (beyond the recovery_offset),
5273         * then we don't need to be concerned about the
5274         * array being dirty.
5275         * When reshape goes 'backwards', we never have
5276         * partially completed devices, so we only need
5277         * to worry about reshape going forwards.
5278         */
5279        /* Hack because v0.91 doesn't store recovery_offset properly. */
5280        if (mddev->major_version == 0 &&
5281            mddev->minor_version > 90)
5282            rdev->recovery_offset = reshape_offset;
5283
5284        if (rdev->recovery_offset < reshape_offset) {
5285            /* We need to check old and new layout */
5286            if (!only_parity(rdev->raid_disk,
5287                     conf->algorithm,
5288                     conf->raid_disks,
5289                     conf->max_degraded))
5290                continue;
5291        }
5292        if (!only_parity(rdev->raid_disk,
5293                 conf->prev_algo,
5294                 conf->previous_raid_disks,
5295                 conf->max_degraded))
5296            continue;
5297        dirty_parity_disks++;
5298    }
5299
5300    /*
5301     * 0 for a fully functional array, 1 or 2 for a degraded array.
5302     */
5303    mddev->degraded = calc_degraded(conf);
5304
5305    if (has_failed(conf)) {
5306        printk(KERN_ERR "md/raid:%s: not enough operational devices"
5307            " (%d/%d failed)\n",
5308            mdname(mddev), mddev->degraded, conf->raid_disks);
5309        goto abort;
5310    }
5311
5312    /* device size must be a multiple of chunk size */
5313    mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
5314    mddev->resync_max_sectors = mddev->dev_sectors;
5315
5316    if (mddev->degraded > dirty_parity_disks &&
5317        mddev->recovery_cp != MaxSector) {
5318        if (mddev->ok_start_degraded)
5319            printk(KERN_WARNING
5320                   "md/raid:%s: starting dirty degraded array"
5321                   " - data corruption possible.\n",
5322                   mdname(mddev));
5323        else {
5324            printk(KERN_ERR
5325                   "md/raid:%s: cannot start dirty degraded array.\n",
5326                   mdname(mddev));
5327            goto abort;
5328        }
5329    }
5330
5331    if (mddev->degraded == 0)
5332        printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
5333               " devices, algorithm %d\n", mdname(mddev), conf->level,
5334               mddev->raid_disks-mddev->degraded, mddev->raid_disks,
5335               mddev->new_layout);
5336    else
5337        printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
5338               " out of %d devices, algorithm %d\n",
5339               mdname(mddev), conf->level,
5340               mddev->raid_disks - mddev->degraded,
5341               mddev->raid_disks, mddev->new_layout);
5342
5343    print_raid5_conf(conf);
5344
5345    if (conf->reshape_progress != MaxSector) {
5346        conf->reshape_safe = conf->reshape_progress;