Root/drivers/md/dm-mpath.c

1/*
2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/device-mapper.h>
9
10#include "dm-path-selector.h"
11#include "dm-uevent.h"
12
13#include <linux/ctype.h>
14#include <linux/init.h>
15#include <linux/mempool.h>
16#include <linux/module.h>
17#include <linux/pagemap.h>
18#include <linux/slab.h>
19#include <linux/time.h>
20#include <linux/workqueue.h>
21#include <linux/delay.h>
22#include <scsi/scsi_dh.h>
23#include <linux/atomic.h>
24
/* Message prefix; presumably consumed by the DM logging macros (e.g. DMWARN). */
#define DM_MSG_PREFIX "multipath"
/* pg_init retry delay, in msecs, used when none has been configured. */
#define DM_PG_INIT_DELAY_MSECS 2000
/* Sentinel stored in pg_init_delay_msecs meaning "not configured". */
#define DM_PG_INIT_DELAY_DEFAULT ((unsigned int) -1)
28
29/* Path properties */
30struct pgpath {
31    struct list_head list;
32
33    struct priority_group *pg; /* Owning PG */
34    unsigned is_active; /* Path status */
35    unsigned fail_count; /* Cumulative failure count */
36
37    struct dm_path path;
38    struct delayed_work activate_path;
39};
40
41#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
42
43/*
44 * Paths are grouped into Priority Groups and numbered from 1 upwards.
45 * Each has a path selector which controls which path gets used.
46 */
47struct priority_group {
48    struct list_head list;
49
50    struct multipath *m; /* Owning multipath instance */
51    struct path_selector ps;
52
53    unsigned pg_num; /* Reference number */
54    unsigned bypassed; /* Temporarily bypass this PG? */
55
56    unsigned nr_pgpaths; /* Number of paths in PG */
57    struct list_head pgpaths;
58};
59
60/* Multipath context */
61struct multipath {
62    struct list_head list;
63    struct dm_target *ti;
64
65    const char *hw_handler_name;
66    char *hw_handler_params;
67
68    spinlock_t lock;
69
70    unsigned nr_priority_groups;
71    struct list_head priority_groups;
72
73    wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
74
75    unsigned pg_init_required; /* pg_init needs calling? */
76    unsigned pg_init_in_progress; /* Only one pg_init allowed at once */
77    unsigned pg_init_delay_retry; /* Delay pg_init retry? */
78
79    unsigned nr_valid_paths; /* Total number of usable paths */
80    struct pgpath *current_pgpath;
81    struct priority_group *current_pg;
82    struct priority_group *next_pg; /* Switch to this PG if set */
83    unsigned repeat_count; /* I/Os left before calling PS again */
84
85    unsigned queue_io:1; /* Must we queue all I/O? */
86    unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
87    unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
88    unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
89
90    unsigned pg_init_retries; /* Number of times to retry pg_init */
91    unsigned pg_init_count; /* Number of times pg_init called */
92    unsigned pg_init_delay_msecs; /* Number of msecs before pg_init retry */
93
94    unsigned queue_size;
95    struct work_struct process_queued_ios;
96    struct list_head queued_ios;
97
98    struct work_struct trigger_event;
99
100    /*
101     * We must use a mempool of dm_mpath_io structs so that we
102     * can resubmit bios on error.
103     */
104    mempool_t *mpio_pool;
105
106    struct mutex work_mutex;
107};
108
109/*
110 * Context information attached to each bio we process.
111 */
112struct dm_mpath_io {
113    struct pgpath *pgpath;
114    size_t nr_bytes;
115};
116
/* Signature of a per-path action as applied by action_dev(). */
typedef int (*action_fn) (struct pgpath *pgpath);

#define MIN_IOS 256 /* Mempool size */

/* Slab cache backing each target's mpio mempool. */
static struct kmem_cache *_mpio_cache;

static struct workqueue_struct *kmultipathd;
static struct workqueue_struct *kmpath_handlerd;

static void process_queued_ios(struct work_struct *work);
static void trigger_event(struct work_struct *work);
static void activate_path(struct work_struct *work);
127
128
129/*-----------------------------------------------
130 * Allocation routines
131 *-----------------------------------------------*/
132
133static struct pgpath *alloc_pgpath(void)
134{
135    struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
136
137    if (pgpath) {
138        pgpath->is_active = 1;
139        INIT_DELAYED_WORK(&pgpath->activate_path, activate_path);
140    }
141
142    return pgpath;
143}
144
/* Release the memory of a pgpath obtained from alloc_pgpath(). */
static void free_pgpath(struct pgpath *pgpath)
{
    kfree(pgpath);
}
149
150static struct priority_group *alloc_priority_group(void)
151{
152    struct priority_group *pg;
153
154    pg = kzalloc(sizeof(*pg), GFP_KERNEL);
155
156    if (pg)
157        INIT_LIST_HEAD(&pg->pgpaths);
158
159    return pg;
160}
161
162static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
163{
164    struct pgpath *pgpath, *tmp;
165    struct multipath *m = ti->private;
166
167    list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
168        list_del(&pgpath->list);
169        if (m->hw_handler_name)
170            scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
171        dm_put_device(ti, pgpath->path.dev);
172        free_pgpath(pgpath);
173    }
174}
175
176static void free_priority_group(struct priority_group *pg,
177                struct dm_target *ti)
178{
179    struct path_selector *ps = &pg->ps;
180
181    if (ps->type) {
182        ps->type->destroy(ps);
183        dm_put_path_selector(ps->type);
184    }
185
186    free_pgpaths(&pg->pgpaths, ti);
187    kfree(pg);
188}
189
190static struct multipath *alloc_multipath(struct dm_target *ti)
191{
192    struct multipath *m;
193
194    m = kzalloc(sizeof(*m), GFP_KERNEL);
195    if (m) {
196        INIT_LIST_HEAD(&m->priority_groups);
197        INIT_LIST_HEAD(&m->queued_ios);
198        spin_lock_init(&m->lock);
199        m->queue_io = 1;
200        m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
201        INIT_WORK(&m->process_queued_ios, process_queued_ios);
202        INIT_WORK(&m->trigger_event, trigger_event);
203        init_waitqueue_head(&m->pg_init_wait);
204        mutex_init(&m->work_mutex);
205        m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
206        if (!m->mpio_pool) {
207            kfree(m);
208            return NULL;
209        }
210        m->ti = ti;
211        ti->private = m;
212    }
213
214    return m;
215}
216
217static void free_multipath(struct multipath *m)
218{
219    struct priority_group *pg, *tmp;
220
221    list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
222        list_del(&pg->list);
223        free_priority_group(pg, m->ti);
224    }
225
226    kfree(m->hw_handler_name);
227    kfree(m->hw_handler_params);
228    mempool_destroy(m->mpio_pool);
229    kfree(m);
230}
231
232static int set_mapinfo(struct multipath *m, union map_info *info)
233{
234    struct dm_mpath_io *mpio;
235
236    mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
237    if (!mpio)
238        return -ENOMEM;
239
240    memset(mpio, 0, sizeof(*mpio));
241    info->ptr = mpio;
242
243    return 0;
244}
245
246static void clear_mapinfo(struct multipath *m, union map_info *info)
247{
248    struct dm_mpath_io *mpio = info->ptr;
249
250    info->ptr = NULL;
251    mempool_free(mpio, m->mpio_pool);
252}
253
254/*-----------------------------------------------
255 * Path selection
256 *-----------------------------------------------*/
257
258static void __pg_init_all_paths(struct multipath *m)
259{
260    struct pgpath *pgpath;
261    unsigned long pg_init_delay = 0;
262
263    m->pg_init_count++;
264    m->pg_init_required = 0;
265    if (m->pg_init_delay_retry)
266        pg_init_delay = msecs_to_jiffies(m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT ?
267                         m->pg_init_delay_msecs : DM_PG_INIT_DELAY_MSECS);
268    list_for_each_entry(pgpath, &m->current_pg->pgpaths, list) {
269        /* Skip failed paths */
270        if (!pgpath->is_active)
271            continue;
272        if (queue_delayed_work(kmpath_handlerd, &pgpath->activate_path,
273                       pg_init_delay))
274            m->pg_init_in_progress++;
275    }
276}
277
278static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
279{
280    m->current_pg = pgpath->pg;
281
282    /* Must we initialise the PG first, and queue I/O till it's ready? */
283    if (m->hw_handler_name) {
284        m->pg_init_required = 1;
285        m->queue_io = 1;
286    } else {
287        m->pg_init_required = 0;
288        m->queue_io = 0;
289    }
290
291    m->pg_init_count = 0;
292}
293
294static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
295                   size_t nr_bytes)
296{
297    struct dm_path *path;
298
299    path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
300    if (!path)
301        return -ENXIO;
302
303    m->current_pgpath = path_to_pgpath(path);
304
305    if (m->current_pg != pg)
306        __switch_pg(m, m->current_pgpath);
307
308    return 0;
309}
310
311static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
312{
313    struct priority_group *pg;
314    unsigned bypassed = 1;
315
316    if (!m->nr_valid_paths)
317        goto failed;
318
319    /* Were we instructed to switch PG? */
320    if (m->next_pg) {
321        pg = m->next_pg;
322        m->next_pg = NULL;
323        if (!__choose_path_in_pg(m, pg, nr_bytes))
324            return;
325    }
326
327    /* Don't change PG until it has no remaining paths */
328    if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
329        return;
330
331    /*
332     * Loop through priority groups until we find a valid path.
333     * First time we skip PGs marked 'bypassed'.
334     * Second time we only try the ones we skipped, but set
335     * pg_init_delay_retry so we do not hammer controllers.
336     */
337    do {
338        list_for_each_entry(pg, &m->priority_groups, list) {
339            if (pg->bypassed == bypassed)
340                continue;
341            if (!__choose_path_in_pg(m, pg, nr_bytes)) {
342                if (!bypassed)
343                    m->pg_init_delay_retry = 1;
344                return;
345            }
346        }
347    } while (bypassed--);
348
349failed:
350    m->current_pgpath = NULL;
351    m->current_pg = NULL;
352}
353
354/*
355 * Check whether bios must be queued in the device-mapper core rather
356 * than here in the target.
357 *
358 * m->lock must be held on entry.
359 *
360 * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
361 * same value then we are not between multipath_presuspend()
362 * and multipath_resume() calls and we have no need to check
363 * for the DMF_NOFLUSH_SUSPENDING flag.
364 */
365static int __must_push_back(struct multipath *m)
366{
367    return (m->queue_if_no_path != m->saved_queue_if_no_path &&
368        dm_noflush_suspending(m->ti));
369}
370
371static int map_io(struct multipath *m, struct request *clone,
372          union map_info *map_context, unsigned was_queued)
373{
374    int r = DM_MAPIO_REMAPPED;
375    size_t nr_bytes = blk_rq_bytes(clone);
376    unsigned long flags;
377    struct pgpath *pgpath;
378    struct block_device *bdev;
379    struct dm_mpath_io *mpio = map_context->ptr;
380
381    spin_lock_irqsave(&m->lock, flags);
382
383    /* Do we need to select a new pgpath? */
384    if (!m->current_pgpath ||
385        (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
386        __choose_pgpath(m, nr_bytes);
387
388    pgpath = m->current_pgpath;
389
390    if (was_queued)
391        m->queue_size--;
392
393    if ((pgpath && m->queue_io) ||
394        (!pgpath && m->queue_if_no_path)) {
395        /* Queue for the daemon to resubmit */
396        list_add_tail(&clone->queuelist, &m->queued_ios);
397        m->queue_size++;
398        if ((m->pg_init_required && !m->pg_init_in_progress) ||
399            !m->queue_io)
400            queue_work(kmultipathd, &m->process_queued_ios);
401        pgpath = NULL;
402        r = DM_MAPIO_SUBMITTED;
403    } else if (pgpath) {
404        bdev = pgpath->path.dev->bdev;
405        clone->q = bdev_get_queue(bdev);
406        clone->rq_disk = bdev->bd_disk;
407    } else if (__must_push_back(m))
408        r = DM_MAPIO_REQUEUE;
409    else
410        r = -EIO; /* Failed */
411
412    mpio->pgpath = pgpath;
413    mpio->nr_bytes = nr_bytes;
414
415    if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
416        pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
417                          nr_bytes);
418
419    spin_unlock_irqrestore(&m->lock, flags);
420
421    return r;
422}
423
424/*
425 * If we run out of usable paths, should we queue I/O or error it?
426 */
427static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
428                unsigned save_old_value)
429{
430    unsigned long flags;
431
432    spin_lock_irqsave(&m->lock, flags);
433
434    if (save_old_value)
435        m->saved_queue_if_no_path = m->queue_if_no_path;
436    else
437        m->saved_queue_if_no_path = queue_if_no_path;
438    m->queue_if_no_path = queue_if_no_path;
439    if (!m->queue_if_no_path && m->queue_size)
440        queue_work(kmultipathd, &m->process_queued_ios);
441
442    spin_unlock_irqrestore(&m->lock, flags);
443
444    return 0;
445}
446
447/*-----------------------------------------------------------------
448 * The multipath daemon is responsible for resubmitting queued ios.
449 *---------------------------------------------------------------*/
450
451static void dispatch_queued_ios(struct multipath *m)
452{
453    int r;
454    unsigned long flags;
455    union map_info *info;
456    struct request *clone, *n;
457    LIST_HEAD(cl);
458
459    spin_lock_irqsave(&m->lock, flags);
460    list_splice_init(&m->queued_ios, &cl);
461    spin_unlock_irqrestore(&m->lock, flags);
462
463    list_for_each_entry_safe(clone, n, &cl, queuelist) {
464        list_del_init(&clone->queuelist);
465
466        info = dm_get_rq_mapinfo(clone);
467
468        r = map_io(m, clone, info, 1);
469        if (r < 0) {
470            clear_mapinfo(m, info);
471            dm_kill_unmapped_request(clone, r);
472        } else if (r == DM_MAPIO_REMAPPED)
473            dm_dispatch_request(clone);
474        else if (r == DM_MAPIO_REQUEUE) {
475            clear_mapinfo(m, info);
476            dm_requeue_unmapped_request(clone);
477        }
478    }
479}
480
481static void process_queued_ios(struct work_struct *work)
482{
483    struct multipath *m =
484        container_of(work, struct multipath, process_queued_ios);
485    struct pgpath *pgpath = NULL;
486    unsigned must_queue = 1;
487    unsigned long flags;
488
489    spin_lock_irqsave(&m->lock, flags);
490
491    if (!m->current_pgpath)
492        __choose_pgpath(m, 0);
493
494    pgpath = m->current_pgpath;
495
496    if ((pgpath && !m->queue_io) ||
497        (!pgpath && !m->queue_if_no_path))
498        must_queue = 0;
499
500    if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
501        __pg_init_all_paths(m);
502
503    spin_unlock_irqrestore(&m->lock, flags);
504    if (!must_queue)
505        dispatch_queued_ios(m);
506}
507
508/*
509 * An event is triggered whenever a path is taken out of use.
510 * Includes path failure and PG bypass.
511 */
512static void trigger_event(struct work_struct *work)
513{
514    struct multipath *m =
515        container_of(work, struct multipath, trigger_event);
516
517    dm_table_event(m->ti->table);
518}
519
520/*-----------------------------------------------------------------
521 * Constructor/argument parsing:
522 * <#multipath feature args> [<arg>]*
523 * <#hw_handler args> [hw_handler [<arg>]*]
524 * <#priority groups>
525 * <initial priority group>
526 * [<selector> <#selector args> [<arg>]*
527 * <#paths> <#per-path selector args>
528 * [<path> [<arg>]* ]+ ]+
529 *---------------------------------------------------------------*/
530static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
531                   struct dm_target *ti)
532{
533    int r;
534    struct path_selector_type *pst;
535    unsigned ps_argc;
536
537    static struct dm_arg _args[] = {
538        {0, 1024, "invalid number of path selector args"},
539    };
540
541    pst = dm_get_path_selector(dm_shift_arg(as));
542    if (!pst) {
543        ti->error = "unknown path selector type";
544        return -EINVAL;
545    }
546
547    r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
548    if (r) {
549        dm_put_path_selector(pst);
550        return -EINVAL;
551    }
552
553    r = pst->create(&pg->ps, ps_argc, as->argv);
554    if (r) {
555        dm_put_path_selector(pst);
556        ti->error = "path selector constructor failed";
557        return r;
558    }
559
560    pg->ps.type = pst;
561    dm_consume_args(as, ps_argc);
562
563    return 0;
564}
565
566static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
567                   struct dm_target *ti)
568{
569    int r;
570    struct pgpath *p;
571    struct multipath *m = ti->private;
572    struct request_queue *q = NULL;
573    const char *attached_handler_name;
574
575    /* we need at least a path arg */
576    if (as->argc < 1) {
577        ti->error = "no device given";
578        return ERR_PTR(-EINVAL);
579    }
580
581    p = alloc_pgpath();
582    if (!p)
583        return ERR_PTR(-ENOMEM);
584
585    r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
586              &p->path.dev);
587    if (r) {
588        ti->error = "error getting device";
589        goto bad;
590    }
591
592    if (m->retain_attached_hw_handler || m->hw_handler_name)
593        q = bdev_get_queue(p->path.dev->bdev);
594
595    if (m->retain_attached_hw_handler) {
596        attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
597        if (attached_handler_name) {
598            /*
599             * Reset hw_handler_name to match the attached handler
600             * and clear any hw_handler_params associated with the
601             * ignored handler.
602             *
603             * NB. This modifies the table line to show the actual
604             * handler instead of the original table passed in.
605             */
606            kfree(m->hw_handler_name);
607            m->hw_handler_name = attached_handler_name;
608
609            kfree(m->hw_handler_params);
610            m->hw_handler_params = NULL;
611        }
612    }
613
614    if (m->hw_handler_name) {
615        /*
616         * Increments scsi_dh reference, even when using an
617         * already-attached handler.
618         */
619        r = scsi_dh_attach(q, m->hw_handler_name);
620        if (r == -EBUSY) {
621            /*
622             * Already attached to different hw_handler:
623             * try to reattach with correct one.
624             */
625            scsi_dh_detach(q);
626            r = scsi_dh_attach(q, m->hw_handler_name);
627        }
628
629        if (r < 0) {
630            ti->error = "error attaching hardware handler";
631            dm_put_device(ti, p->path.dev);
632            goto bad;
633        }
634
635        if (m->hw_handler_params) {
636            r = scsi_dh_set_params(q, m->hw_handler_params);
637            if (r < 0) {
638                ti->error = "unable to set hardware "
639                            "handler parameters";
640                scsi_dh_detach(q);
641                dm_put_device(ti, p->path.dev);
642                goto bad;
643            }
644        }
645    }
646
647    r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
648    if (r) {
649        dm_put_device(ti, p->path.dev);
650        goto bad;
651    }
652
653    return p;
654
655 bad:
656    free_pgpath(p);
657    return ERR_PTR(r);
658}
659
660static struct priority_group *parse_priority_group(struct dm_arg_set *as,
661                           struct multipath *m)
662{
663    static struct dm_arg _args[] = {
664        {1, 1024, "invalid number of paths"},
665        {0, 1024, "invalid number of selector args"}
666    };
667
668    int r;
669    unsigned i, nr_selector_args, nr_args;
670    struct priority_group *pg;
671    struct dm_target *ti = m->ti;
672
673    if (as->argc < 2) {
674        as->argc = 0;
675        ti->error = "not enough priority group arguments";
676        return ERR_PTR(-EINVAL);
677    }
678
679    pg = alloc_priority_group();
680    if (!pg) {
681        ti->error = "couldn't allocate priority group";
682        return ERR_PTR(-ENOMEM);
683    }
684    pg->m = m;
685
686    r = parse_path_selector(as, pg, ti);
687    if (r)
688        goto bad;
689
690    /*
691     * read the paths
692     */
693    r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
694    if (r)
695        goto bad;
696
697    r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
698    if (r)
699        goto bad;
700
701    nr_args = 1 + nr_selector_args;
702    for (i = 0; i < pg->nr_pgpaths; i++) {
703        struct pgpath *pgpath;
704        struct dm_arg_set path_args;
705
706        if (as->argc < nr_args) {
707            ti->error = "not enough path parameters";
708            r = -EINVAL;
709            goto bad;
710        }
711
712        path_args.argc = nr_args;
713        path_args.argv = as->argv;
714
715        pgpath = parse_path(&path_args, &pg->ps, ti);
716        if (IS_ERR(pgpath)) {
717            r = PTR_ERR(pgpath);
718            goto bad;
719        }
720
721        pgpath->pg = pg;
722        list_add_tail(&pgpath->list, &pg->pgpaths);
723        dm_consume_args(as, nr_args);
724    }
725
726    return pg;
727
728 bad:
729    free_priority_group(pg, ti);
730    return ERR_PTR(r);
731}
732
733static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
734{
735    unsigned hw_argc;
736    int ret;
737    struct dm_target *ti = m->ti;
738
739    static struct dm_arg _args[] = {
740        {0, 1024, "invalid number of hardware handler args"},
741    };
742
743    if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
744        return -EINVAL;
745
746    if (!hw_argc)
747        return 0;
748
749    m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
750    if (!try_then_request_module(scsi_dh_handler_exist(m->hw_handler_name),
751                     "scsi_dh_%s", m->hw_handler_name)) {
752        ti->error = "unknown hardware handler type";
753        ret = -EINVAL;
754        goto fail;
755    }
756
757    if (hw_argc > 1) {
758        char *p;
759        int i, j, len = 4;
760
761        for (i = 0; i <= hw_argc - 2; i++)
762            len += strlen(as->argv[i]) + 1;
763        p = m->hw_handler_params = kzalloc(len, GFP_KERNEL);
764        if (!p) {
765            ti->error = "memory allocation failed";
766            ret = -ENOMEM;
767            goto fail;
768        }
769        j = sprintf(p, "%d", hw_argc - 1);
770        for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
771            j = sprintf(p, "%s", as->argv[i]);
772    }
773    dm_consume_args(as, hw_argc - 1);
774
775    return 0;
776fail:
777    kfree(m->hw_handler_name);
778    m->hw_handler_name = NULL;
779    return ret;
780}
781
782static int parse_features(struct dm_arg_set *as, struct multipath *m)
783{
784    int r;
785    unsigned argc;
786    struct dm_target *ti = m->ti;
787    const char *arg_name;
788
789    static struct dm_arg _args[] = {
790        {0, 6, "invalid number of feature args"},
791        {1, 50, "pg_init_retries must be between 1 and 50"},
792        {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
793    };
794
795    r = dm_read_arg_group(_args, as, &argc, &ti->error);
796    if (r)
797        return -EINVAL;
798
799    if (!argc)
800        return 0;
801
802    do {
803        arg_name = dm_shift_arg(as);
804        argc--;
805
806        if (!strcasecmp(arg_name, "queue_if_no_path")) {
807            r = queue_if_no_path(m, 1, 0);
808            continue;
809        }
810
811        if (!strcasecmp(arg_name, "retain_attached_hw_handler")) {
812            m->retain_attached_hw_handler = 1;
813            continue;
814        }
815
816        if (!strcasecmp(arg_name, "pg_init_retries") &&
817            (argc >= 1)) {
818            r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
819            argc--;
820            continue;
821        }
822
823        if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
824            (argc >= 1)) {
825            r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
826            argc--;
827            continue;
828        }
829
830        ti->error = "Unrecognised multipath feature request";
831        r = -EINVAL;
832    } while (argc && !r);
833
834    return r;
835}
836
837static int multipath_ctr(struct dm_target *ti, unsigned int argc,
838             char **argv)
839{
840    /* target arguments */
841    static struct dm_arg _args[] = {
842        {0, 1024, "invalid number of priority groups"},
843        {0, 1024, "invalid initial priority group number"},
844    };
845
846    int r;
847    struct multipath *m;
848    struct dm_arg_set as;
849    unsigned pg_count = 0;
850    unsigned next_pg_num;
851
852    as.argc = argc;
853    as.argv = argv;
854
855    m = alloc_multipath(ti);
856    if (!m) {
857        ti->error = "can't allocate multipath";
858        return -EINVAL;
859    }
860
861    r = parse_features(&as, m);
862    if (r)
863        goto bad;
864
865    r = parse_hw_handler(&as, m);
866    if (r)
867        goto bad;
868
869    r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
870    if (r)
871        goto bad;
872
873    r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
874    if (r)
875        goto bad;
876
877    if ((!m->nr_priority_groups && next_pg_num) ||
878        (m->nr_priority_groups && !next_pg_num)) {
879        ti->error = "invalid initial priority group";
880        r = -EINVAL;
881        goto bad;
882    }
883
884    /* parse the priority groups */
885    while (as.argc) {
886        struct priority_group *pg;
887
888        pg = parse_priority_group(&as, m);
889        if (IS_ERR(pg)) {
890            r = PTR_ERR(pg);
891            goto bad;
892        }
893
894        m->nr_valid_paths += pg->nr_pgpaths;
895        list_add_tail(&pg->list, &m->priority_groups);
896        pg_count++;
897        pg->pg_num = pg_count;
898        if (!--next_pg_num)
899            m->next_pg = pg;
900    }
901
902    if (pg_count != m->nr_priority_groups) {
903        ti->error = "priority group count mismatch";
904        r = -EINVAL;
905        goto bad;
906    }
907
908    ti->num_flush_bios = 1;
909    ti->num_discard_bios = 1;
910
911    return 0;
912
913 bad:
914    free_multipath(m);
915    return r;
916}
917
918static void multipath_wait_for_pg_init_completion(struct multipath *m)
919{
920    DECLARE_WAITQUEUE(wait, current);
921    unsigned long flags;
922
923    add_wait_queue(&m->pg_init_wait, &wait);
924
925    while (1) {
926        set_current_state(TASK_UNINTERRUPTIBLE);
927
928        spin_lock_irqsave(&m->lock, flags);
929        if (!m->pg_init_in_progress) {
930            spin_unlock_irqrestore(&m->lock, flags);
931            break;
932        }
933        spin_unlock_irqrestore(&m->lock, flags);
934
935        io_schedule();
936    }
937    set_current_state(TASK_RUNNING);
938
939    remove_wait_queue(&m->pg_init_wait, &wait);
940}
941
942static void flush_multipath_work(struct multipath *m)
943{
944    flush_workqueue(kmpath_handlerd);
945    multipath_wait_for_pg_init_completion(m);
946    flush_workqueue(kmultipathd);
947    flush_work(&m->trigger_event);
948}
949
950static void multipath_dtr(struct dm_target *ti)
951{
952    struct multipath *m = ti->private;
953
954    flush_multipath_work(m);
955    free_multipath(m);
956}
957
958/*
959 * Map cloned requests
960 */
961static int multipath_map(struct dm_target *ti, struct request *clone,
962             union map_info *map_context)
963{
964    int r;
965    struct multipath *m = (struct multipath *) ti->private;
966
967    if (set_mapinfo(m, map_context) < 0)
968        /* ENOMEM, requeue */
969        return DM_MAPIO_REQUEUE;
970
971    clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
972    r = map_io(m, clone, map_context, 0);
973    if (r < 0 || r == DM_MAPIO_REQUEUE)
974        clear_mapinfo(m, map_context);
975
976    return r;
977}
978
979/*
980 * Take a path out of use.
981 */
982static int fail_path(struct pgpath *pgpath)
983{
984    unsigned long flags;
985    struct multipath *m = pgpath->pg->m;
986
987    spin_lock_irqsave(&m->lock, flags);
988
989    if (!pgpath->is_active)
990        goto out;
991
992    DMWARN("Failing path %s.", pgpath->path.dev->name);
993
994    pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
995    pgpath->is_active = 0;
996    pgpath->fail_count++;
997
998    m->nr_valid_paths--;
999
1000    if (pgpath == m->current_pgpath)
1001        m->current_pgpath = NULL;
1002
1003    dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
1004              pgpath->path.dev->name, m->nr_valid_paths);
1005
1006    schedule_work(&m->trigger_event);
1007
1008out:
1009    spin_unlock_irqrestore(&m->lock, flags);
1010
1011    return 0;
1012}
1013
1014/*
1015 * Reinstate a previously-failed path
1016 */
1017static int reinstate_path(struct pgpath *pgpath)
1018{
1019    int r = 0;
1020    unsigned long flags;
1021    struct multipath *m = pgpath->pg->m;
1022
1023    spin_lock_irqsave(&m->lock, flags);
1024
1025    if (pgpath->is_active)
1026        goto out;
1027
1028    if (!pgpath->pg->ps.type->reinstate_path) {
1029        DMWARN("Reinstate path not supported by path selector %s",
1030               pgpath->pg->ps.type->name);
1031        r = -EINVAL;
1032        goto out;
1033    }
1034
1035    r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
1036    if (r)
1037        goto out;
1038
1039    pgpath->is_active = 1;
1040
1041    if (!m->nr_valid_paths++ && m->queue_size) {
1042        m->current_pgpath = NULL;
1043        queue_work(kmultipathd, &m->process_queued_ios);
1044    } else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
1045        if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
1046            m->pg_init_in_progress++;
1047    }
1048
1049    dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
1050              pgpath->path.dev->name, m->nr_valid_paths);
1051
1052    schedule_work(&m->trigger_event);
1053
1054out:
1055    spin_unlock_irqrestore(&m->lock, flags);
1056
1057    return r;
1058}
1059
1060/*
1061 * Fail or reinstate all paths that match the provided struct dm_dev.
1062 */
1063static int action_dev(struct multipath *m, struct dm_dev *dev,
1064              action_fn action)
1065{
1066    int r = -EINVAL;
1067    struct pgpath *pgpath;
1068    struct priority_group *pg;
1069
1070    list_for_each_entry(pg, &m->priority_groups, list) {
1071        list_for_each_entry(pgpath, &pg->pgpaths, list) {
1072            if (pgpath->path.dev == dev)
1073                r = action(pgpath);
1074        }
1075    }
1076
1077    return r;
1078}
1079
1080/*
1081 * Temporarily try to avoid having to use the specified PG
1082 */
1083static void bypass_pg(struct multipath *m, struct priority_group *pg,
1084              int bypassed)
1085{
1086    unsigned long flags;
1087
1088    spin_lock_irqsave(&m->lock, flags);
1089
1090    pg->bypassed = bypassed;
1091    m->current_pgpath = NULL;
1092    m->current_pg = NULL;
1093
1094    spin_unlock_irqrestore(&m->lock, flags);
1095
1096    schedule_work(&m->trigger_event);
1097}
1098
1099/*
1100 * Switch to using the specified PG from the next I/O that gets mapped
1101 */
1102static int switch_pg_num(struct multipath *m, const char *pgstr)
1103{
1104    struct priority_group *pg;
1105    unsigned pgnum;
1106    unsigned long flags;
1107    char dummy;
1108
1109    if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1110        (pgnum > m->nr_priority_groups)) {
1111        DMWARN("invalid PG number supplied to switch_pg_num");
1112        return -EINVAL;
1113    }
1114
1115    spin_lock_irqsave(&m->lock, flags);
1116    list_for_each_entry(pg, &m->priority_groups, list) {
1117        pg->bypassed = 0;
1118        if (--pgnum)
1119            continue;
1120
1121        m->current_pgpath = NULL;
1122        m->current_pg = NULL;
1123        m->next_pg = pg;
1124    }
1125    spin_unlock_irqrestore(&m->lock, flags);
1126
1127    schedule_work(&m->trigger_event);
1128    return 0;
1129}
1130
1131/*
1132 * Set/clear bypassed status of a PG.
1133 * PGs are numbered upwards from 1 in the order they were declared.
1134 */
1135static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1136{
1137    struct priority_group *pg;
1138    unsigned pgnum;
1139    char dummy;
1140
1141    if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1142        (pgnum > m->nr_priority_groups)) {
1143        DMWARN("invalid PG number supplied to bypass_pg");
1144        return -EINVAL;
1145    }
1146
1147    list_for_each_entry(pg, &m->priority_groups, list) {
1148        if (!--pgnum)
1149            break;
1150    }
1151
1152    bypass_pg(m, pg, bypassed);
1153    return 0;
1154}
1155
1156/*
1157 * Should we retry pg_init immediately?
1158 */
1159static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
1160{
1161    unsigned long flags;
1162    int limit_reached = 0;
1163
1164    spin_lock_irqsave(&m->lock, flags);
1165
1166    if (m->pg_init_count <= m->pg_init_retries)
1167        m->pg_init_required = 1;
1168    else
1169        limit_reached = 1;
1170
1171    spin_unlock_irqrestore(&m->lock, flags);
1172
1173    return limit_reached;
1174}
1175
1176static void pg_init_done(void *data, int errors)
1177{
1178    struct pgpath *pgpath = data;
1179    struct priority_group *pg = pgpath->pg;
1180    struct multipath *m = pg->m;
1181    unsigned long flags;
1182    unsigned delay_retry = 0;
1183
1184    /* device or driver problems */
1185    switch (errors) {
1186    case SCSI_DH_OK:
1187        break;
1188    case SCSI_DH_NOSYS:
1189        if (!m->hw_handler_name) {
1190            errors = 0;
1191            break;
1192        }
1193        DMERR("Could not failover the device: Handler scsi_dh_%s "
1194              "Error %d.", m->hw_handler_name, errors);
1195        /*
1196         * Fail path for now, so we do not ping pong
1197         */
1198        fail_path(pgpath);
1199        break;
1200    case SCSI_DH_DEV_TEMP_BUSY:
1201        /*
1202         * Probably doing something like FW upgrade on the
1203         * controller so try the other pg.
1204         */
1205        bypass_pg(m, pg, 1);
1206        break;
1207    case SCSI_DH_RETRY:
1208        /* Wait before retrying. */
1209        delay_retry = 1;
1210    case SCSI_DH_IMM_RETRY:
1211    case SCSI_DH_RES_TEMP_UNAVAIL:
1212        if (pg_init_limit_reached(m, pgpath))
1213            fail_path(pgpath);
1214        errors = 0;
1215        break;
1216    default:
1217        /*
1218         * We probably do not want to fail the path for a device
1219         * error, but this is what the old dm did. In future
1220         * patches we can do more advanced handling.
1221         */
1222        fail_path(pgpath);
1223    }
1224
1225    spin_lock_irqsave(&m->lock, flags);
1226    if (errors) {
1227        if (pgpath == m->current_pgpath) {
1228            DMERR("Could not failover device. Error %d.", errors);
1229            m->current_pgpath = NULL;
1230            m->current_pg = NULL;
1231        }
1232    } else if (!m->pg_init_required)
1233        pg->bypassed = 0;
1234
1235    if (--m->pg_init_in_progress)
1236        /* Activations of other paths are still on going */
1237        goto out;
1238
1239    if (!m->pg_init_required)
1240        m->queue_io = 0;
1241
1242    m->pg_init_delay_retry = delay_retry;
1243    queue_work(kmultipathd, &m->process_queued_ios);
1244
1245    /*
1246     * Wake up any thread waiting to suspend.
1247     */
1248    wake_up(&m->pg_init_wait);
1249
1250out:
1251    spin_unlock_irqrestore(&m->lock, flags);
1252}
1253
1254static void activate_path(struct work_struct *work)
1255{
1256    struct pgpath *pgpath =
1257        container_of(work, struct pgpath, activate_path.work);
1258
1259    scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
1260                pg_init_done, pgpath);
1261}
1262
1263/*
1264 * end_io handling
1265 */
1266static int do_end_io(struct multipath *m, struct request *clone,
1267             int error, struct dm_mpath_io *mpio)
1268{
1269    /*
1270     * We don't queue any clone request inside the multipath target
1271     * during end I/O handling, since those clone requests don't have
1272     * bio clones. If we queue them inside the multipath target,
1273     * we need to make bio clones, that requires memory allocation.
1274     * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
1275     * don't have bio clones.)
1276     * Instead of queueing the clone request here, we queue the original
1277     * request into dm core, which will remake a clone request and
1278     * clone bios for it and resubmit it later.
1279     */
1280    int r = DM_ENDIO_REQUEUE;
1281    unsigned long flags;
1282
1283    if (!error && !clone->errors)
1284        return 0; /* I/O complete */
1285
1286    if (error == -EOPNOTSUPP || error == -EREMOTEIO || error == -EILSEQ)
1287        return error;
1288
1289    if (mpio->pgpath)
1290        fail_path(mpio->pgpath);
1291
1292    spin_lock_irqsave(&m->lock, flags);
1293    if (!m->nr_valid_paths) {
1294        if (!m->queue_if_no_path) {
1295            if (!__must_push_back(m))
1296                r = -EIO;
1297        } else {
1298            if (error == -EBADE)
1299                r = error;
1300        }
1301    }
1302    spin_unlock_irqrestore(&m->lock, flags);
1303
1304    return r;
1305}
1306
1307static int multipath_end_io(struct dm_target *ti, struct request *clone,
1308                int error, union map_info *map_context)
1309{
1310    struct multipath *m = ti->private;
1311    struct dm_mpath_io *mpio = map_context->ptr;
1312    struct pgpath *pgpath;
1313    struct path_selector *ps;
1314    int r;
1315
1316    BUG_ON(!mpio);
1317
1318    r = do_end_io(m, clone, error, mpio);
1319    pgpath = mpio->pgpath;
1320    if (pgpath) {
1321        ps = &pgpath->pg->ps;
1322        if (ps->type->end_io)
1323            ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1324    }
1325    clear_mapinfo(m, map_context);
1326
1327    return r;
1328}
1329
1330/*
1331 * Suspend can't complete until all the I/O is processed so if
1332 * the last path fails we must error any remaining I/O.
1333 * Note that if the freeze_bdev fails while suspending, the
1334 * queue_if_no_path state is lost - userspace should reset it.
1335 */
1336static void multipath_presuspend(struct dm_target *ti)
1337{
1338    struct multipath *m = (struct multipath *) ti->private;
1339
1340    queue_if_no_path(m, 0, 1);
1341}
1342
1343static void multipath_postsuspend(struct dm_target *ti)
1344{
1345    struct multipath *m = ti->private;
1346
1347    mutex_lock(&m->work_mutex);
1348    flush_multipath_work(m);
1349    mutex_unlock(&m->work_mutex);
1350}
1351
1352/*
1353 * Restore the queue_if_no_path setting.
1354 */
1355static void multipath_resume(struct dm_target *ti)
1356{
1357    struct multipath *m = (struct multipath *) ti->private;
1358    unsigned long flags;
1359
1360    spin_lock_irqsave(&m->lock, flags);
1361    m->queue_if_no_path = m->saved_queue_if_no_path;
1362    spin_unlock_irqrestore(&m->lock, flags);
1363}
1364
1365/*
1366 * Info output has the following format:
1367 * num_multipath_feature_args [multipath_feature_args]*
1368 * num_handler_status_args [handler_status_args]*
1369 * num_groups init_group_number
1370 * [A|D|E num_ps_status_args [ps_status_args]*
1371 * num_paths num_selector_args
1372 * [path_dev A|F fail_count [selector_args]* ]+ ]+
1373 *
1374 * Table output has the following format (identical to the constructor string):
1375 * num_feature_args [features_args]*
1376 * num_handler_args hw_handler [hw_handler_args]*
1377 * num_groups init_group_number
1378 * [priority selector-name num_ps_args [ps_args]*
1379 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1380 */
1381static void multipath_status(struct dm_target *ti, status_type_t type,
1382                 unsigned status_flags, char *result, unsigned maxlen)
1383{
1384    int sz = 0;
1385    unsigned long flags;
1386    struct multipath *m = (struct multipath *) ti->private;
1387    struct priority_group *pg;
1388    struct pgpath *p;
1389    unsigned pg_num;
1390    char state;
1391
1392    spin_lock_irqsave(&m->lock, flags);
1393
1394    /* Features */
1395    if (type == STATUSTYPE_INFO)
1396        DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
1397    else {
1398        DMEMIT("%u ", m->queue_if_no_path +
1399                  (m->pg_init_retries > 0) * 2 +
1400                  (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 +
1401                  m->retain_attached_hw_handler);
1402        if (m->queue_if_no_path)
1403            DMEMIT("queue_if_no_path ");
1404        if (m->pg_init_retries)
1405            DMEMIT("pg_init_retries %u ", m->pg_init_retries);
1406        if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT)
1407            DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs);
1408        if (m->retain_attached_hw_handler)
1409            DMEMIT("retain_attached_hw_handler ");
1410    }
1411
1412    if (!m->hw_handler_name || type == STATUSTYPE_INFO)
1413        DMEMIT("0 ");
1414    else
1415        DMEMIT("1 %s ", m->hw_handler_name);
1416
1417    DMEMIT("%u ", m->nr_priority_groups);
1418
1419    if (m->next_pg)
1420        pg_num = m->next_pg->pg_num;
1421    else if (m->current_pg)
1422        pg_num = m->current_pg->pg_num;
1423    else
1424        pg_num = (m->nr_priority_groups ? 1 : 0);
1425
1426    DMEMIT("%u ", pg_num);
1427
1428    switch (type) {
1429    case STATUSTYPE_INFO:
1430        list_for_each_entry(pg, &m->priority_groups, list) {
1431            if (pg->bypassed)
1432                state = 'D'; /* Disabled */
1433            else if (pg == m->current_pg)
1434                state = 'A'; /* Currently Active */
1435            else
1436                state = 'E'; /* Enabled */
1437
1438            DMEMIT("%c ", state);
1439
1440            if (pg->ps.type->status)
1441                sz += pg->ps.type->status(&pg->ps, NULL, type,
1442                              result + sz,
1443                              maxlen - sz);
1444            else
1445                DMEMIT("0 ");
1446
1447            DMEMIT("%u %u ", pg->nr_pgpaths,
1448                   pg->ps.type->info_args);
1449
1450            list_for_each_entry(p, &pg->pgpaths, list) {
1451                DMEMIT("%s %s %u ", p->path.dev->name,
1452                       p->is_active ? "A" : "F",
1453                       p->fail_count);
1454                if (pg->ps.type->status)
1455                    sz += pg->ps.type->status(&pg->ps,
1456                          &p->path, type, result + sz,
1457                          maxlen - sz);
1458            }
1459        }
1460        break;
1461
1462    case STATUSTYPE_TABLE:
1463        list_for_each_entry(pg, &m->priority_groups, list) {
1464            DMEMIT("%s ", pg->ps.type->name);
1465
1466            if (pg->ps.type->status)
1467                sz += pg->ps.type->status(&pg->ps, NULL, type,
1468                              result + sz,
1469                              maxlen - sz);
1470            else
1471                DMEMIT("0 ");
1472
1473            DMEMIT("%u %u ", pg->nr_pgpaths,
1474                   pg->ps.type->table_args);
1475
1476            list_for_each_entry(p, &pg->pgpaths, list) {
1477                DMEMIT("%s ", p->path.dev->name);
1478                if (pg->ps.type->status)
1479                    sz += pg->ps.type->status(&pg->ps,
1480                          &p->path, type, result + sz,
1481                          maxlen - sz);
1482            }
1483        }
1484        break;
1485    }
1486
1487    spin_unlock_irqrestore(&m->lock, flags);
1488}
1489
1490static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1491{
1492    int r = -EINVAL;
1493    struct dm_dev *dev;
1494    struct multipath *m = (struct multipath *) ti->private;
1495    action_fn action;
1496
1497    mutex_lock(&m->work_mutex);
1498
1499    if (dm_suspended(ti)) {
1500        r = -EBUSY;
1501        goto out;
1502    }
1503
1504    if (argc == 1) {
1505        if (!strcasecmp(argv[0], "queue_if_no_path")) {
1506            r = queue_if_no_path(m, 1, 0);
1507            goto out;
1508        } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
1509            r = queue_if_no_path(m, 0, 0);
1510            goto out;
1511        }
1512    }
1513
1514    if (argc != 2) {
1515        DMWARN("Unrecognised multipath message received.");
1516        goto out;
1517    }
1518
1519    if (!strcasecmp(argv[0], "disable_group")) {
1520        r = bypass_pg_num(m, argv[1], 1);
1521        goto out;
1522    } else if (!strcasecmp(argv[0], "enable_group")) {
1523        r = bypass_pg_num(m, argv[1], 0);
1524        goto out;
1525    } else if (!strcasecmp(argv[0], "switch_group")) {
1526        r = switch_pg_num(m, argv[1]);
1527        goto out;
1528    } else if (!strcasecmp(argv[0], "reinstate_path"))
1529        action = reinstate_path;
1530    else if (!strcasecmp(argv[0], "fail_path"))
1531        action = fail_path;
1532    else {
1533        DMWARN("Unrecognised multipath message received.");
1534        goto out;
1535    }
1536
1537    r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev);
1538    if (r) {
1539        DMWARN("message: error getting device %s",
1540               argv[1]);
1541        goto out;
1542    }
1543
1544    r = action_dev(m, dev, action);
1545
1546    dm_put_device(ti, dev);
1547
1548out:
1549    mutex_unlock(&m->work_mutex);
1550    return r;
1551}
1552
1553static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
1554               unsigned long arg)
1555{
1556    struct multipath *m = ti->private;
1557    struct pgpath *pgpath;
1558    struct block_device *bdev;
1559    fmode_t mode;
1560    unsigned long flags;
1561    int r;
1562
1563again:
1564    bdev = NULL;
1565    mode = 0;
1566    r = 0;
1567
1568    spin_lock_irqsave(&m->lock, flags);
1569
1570    if (!m->current_pgpath)
1571        __choose_pgpath(m, 0);
1572
1573    pgpath = m->current_pgpath;
1574
1575    if (pgpath) {
1576        bdev = pgpath->path.dev->bdev;
1577        mode = pgpath->path.dev->mode;
1578    }
1579
1580    if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
1581        r = -EAGAIN;
1582    else if (!bdev)
1583        r = -EIO;
1584
1585    spin_unlock_irqrestore(&m->lock, flags);
1586
1587    /*
1588     * Only pass ioctls through if the device sizes match exactly.
1589     */
1590    if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
1591        r = scsi_verify_blk_ioctl(NULL, cmd);
1592
1593    if (r == -EAGAIN && !fatal_signal_pending(current)) {
1594        queue_work(kmultipathd, &m->process_queued_ios);
1595        msleep(10);
1596        goto again;
1597    }
1598
1599    return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
1600}
1601
1602static int multipath_iterate_devices(struct dm_target *ti,
1603                     iterate_devices_callout_fn fn, void *data)
1604{
1605    struct multipath *m = ti->private;
1606    struct priority_group *pg;
1607    struct pgpath *p;
1608    int ret = 0;
1609
1610    list_for_each_entry(pg, &m->priority_groups, list) {
1611        list_for_each_entry(p, &pg->pgpaths, list) {
1612            ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
1613            if (ret)
1614                goto out;
1615        }
1616    }
1617
1618out:
1619    return ret;
1620}
1621
1622static int __pgpath_busy(struct pgpath *pgpath)
1623{
1624    struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
1625
1626    return dm_underlying_device_busy(q);
1627}
1628
1629/*
1630 * We return "busy", only when we can map I/Os but underlying devices
1631 * are busy (so even if we map I/Os now, the I/Os will wait on
1632 * the underlying queue).
1633 * In other words, if we want to kill I/Os or queue them inside us
1634 * due to map unavailability, we don't return "busy". Otherwise,
1635 * dm core won't give us the I/Os and we can't do what we want.
1636 */
1637static int multipath_busy(struct dm_target *ti)
1638{
1639    int busy = 0, has_active = 0;
1640    struct multipath *m = ti->private;
1641    struct priority_group *pg;
1642    struct pgpath *pgpath;
1643    unsigned long flags;
1644
1645    spin_lock_irqsave(&m->lock, flags);
1646
1647    /* Guess which priority_group will be used at next mapping time */
1648    if (unlikely(!m->current_pgpath && m->next_pg))
1649        pg = m->next_pg;
1650    else if (likely(m->current_pg))
1651        pg = m->current_pg;
1652    else
1653        /*
1654         * We don't know which pg will be used at next mapping time.
1655         * We don't call __choose_pgpath() here to avoid to trigger
1656         * pg_init just by busy checking.
1657         * So we don't know whether underlying devices we will be using
1658         * at next mapping time are busy or not. Just try mapping.
1659         */
1660        goto out;
1661
1662    /*
1663     * If there is one non-busy active path at least, the path selector
1664     * will be able to select it. So we consider such a pg as not busy.
1665     */
1666    busy = 1;
1667    list_for_each_entry(pgpath, &pg->pgpaths, list)
1668        if (pgpath->is_active) {
1669            has_active = 1;
1670
1671            if (!__pgpath_busy(pgpath)) {
1672                busy = 0;
1673                break;
1674            }
1675        }
1676
1677    if (!has_active)
1678        /*
1679         * No active path in this pg, so this pg won't be used and
1680         * the current_pg will be changed at next mapping time.
1681         * We need to try mapping to determine it.
1682         */
1683        busy = 0;
1684
1685out:
1686    spin_unlock_irqrestore(&m->lock, flags);
1687
1688    return busy;
1689}
1690
1691/*-----------------------------------------------------------------
1692 * Module setup
1693 *---------------------------------------------------------------*/
1694static struct target_type multipath_target = {
1695    .name = "multipath",
1696    .version = {1, 5, 1},
1697    .module = THIS_MODULE,
1698    .ctr = multipath_ctr,
1699    .dtr = multipath_dtr,
1700    .map_rq = multipath_map,
1701    .rq_end_io = multipath_end_io,
1702    .presuspend = multipath_presuspend,
1703    .postsuspend = multipath_postsuspend,
1704    .resume = multipath_resume,
1705    .status = multipath_status,
1706    .message = multipath_message,
1707    .ioctl = multipath_ioctl,
1708    .iterate_devices = multipath_iterate_devices,
1709    .busy = multipath_busy,
1710};
1711
1712static int __init dm_multipath_init(void)
1713{
1714    int r;
1715
1716    /* allocate a slab for the dm_ios */
1717    _mpio_cache = KMEM_CACHE(dm_mpath_io, 0);
1718    if (!_mpio_cache)
1719        return -ENOMEM;
1720
1721    r = dm_register_target(&multipath_target);
1722    if (r < 0) {
1723        DMERR("register failed %d", r);
1724        kmem_cache_destroy(_mpio_cache);
1725        return -EINVAL;
1726    }
1727
1728    kmultipathd = alloc_workqueue("kmpathd", WQ_MEM_RECLAIM, 0);
1729    if (!kmultipathd) {
1730        DMERR("failed to create workqueue kmpathd");
1731        dm_unregister_target(&multipath_target);
1732        kmem_cache_destroy(_mpio_cache);
1733        return -ENOMEM;
1734    }
1735
1736    /*
1737     * A separate workqueue is used to handle the device handlers
1738     * to avoid overloading existing workqueue. Overloading the
1739     * old workqueue would also create a bottleneck in the
1740     * path of the storage hardware device activation.
1741     */
1742    kmpath_handlerd = alloc_ordered_workqueue("kmpath_handlerd",
1743                          WQ_MEM_RECLAIM);
1744    if (!kmpath_handlerd) {
1745        DMERR("failed to create workqueue kmpath_handlerd");
1746        destroy_workqueue(kmultipathd);
1747        dm_unregister_target(&multipath_target);
1748        kmem_cache_destroy(_mpio_cache);
1749        return -ENOMEM;
1750    }
1751
1752    DMINFO("version %u.%u.%u loaded",
1753           multipath_target.version[0], multipath_target.version[1],
1754           multipath_target.version[2]);
1755
1756    return r;
1757}
1758
1759static void __exit dm_multipath_exit(void)
1760{
1761    destroy_workqueue(kmpath_handlerd);
1762    destroy_workqueue(kmultipathd);
1763
1764    dm_unregister_target(&multipath_target);
1765    kmem_cache_destroy(_mpio_cache);
1766}
1767
1768module_init(dm_multipath_init);
1769module_exit(dm_multipath_exit);
1770
1771MODULE_DESCRIPTION(DM_NAME " multipath target");
1772MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1773MODULE_LICENSE("GPL");
1774

Archive Download this file



interactive