Root/drivers/block/xen-blkfront.c

1/*
2 * blkfront.c
3 *
4 * XenLinux virtual block device driver.
5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license:
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE.
36 */
37
38#include <linux/interrupt.h>
39#include <linux/blkdev.h>
40#include <linux/hdreg.h>
41#include <linux/cdrom.h>
42#include <linux/module.h>
43#include <linux/slab.h>
44#include <linux/mutex.h>
45#include <linux/scatterlist.h>
46#include <linux/bitmap.h>
47
48#include <xen/xen.h>
49#include <xen/xenbus.h>
50#include <xen/grant_table.h>
51#include <xen/events.h>
52#include <xen/page.h>
53#include <xen/platform_pci.h>
54
55#include <xen/interface/grant_table.h>
56#include <xen/interface/io/blkif.h>
57#include <xen/interface/io/protocols.h>
58
59#include <asm/xen/hypervisor.h>
60
/*
 * Connection state of one frontend device, kept in blkfront_info.connected.
 * blkif_queue_request() and blkif_interrupt() only do work while the state
 * is BLKIF_STATE_CONNECTED.
 */
61enum blkif_state {
62    BLKIF_STATE_DISCONNECTED,   /* set by blkif_free() when not suspending */
63    BLKIF_STATE_CONNECTED,
64    BLKIF_STATE_SUSPENDED,      /* set by blkif_free() when suspending */
65};
66
/*
 * Frontend-private record of one ring request, indexed by request id
 * (see blkfront_info.shadow[]).
 */
67struct blk_shadow {
    /*
     * Private copy of the request written to the ring. While the slot is
     * on the free list, req.u.rw.id holds the index of the next free slot.
     */
68    struct blkif_request req;
    /* Block-layer request this slot tracks; reset to NULL by add_id_to_freelist(). */
69    struct request *request;
    /* Per-segment frame number recorded by blkif_queue_request() (mfn_to_pfn of the granted page). */
70    unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
71};
72
/* NOTE(review): blkfront_mutex is not used in the part of the file shown here - confirm its role at the users. */
73static DEFINE_MUTEX(blkfront_mutex);
/* Forward declaration; installed as gd->fops in xlvbd_alloc_gendisk(). */
74static const struct block_device_operations xlvbd_block_fops;
75
/*
 * Number of entries in the blkif ring for a PAGE_SIZE shared ring. Also
 * used as the size of blkfront_info.shadow[] and as the upper bound for
 * valid request ids.
 */
76#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
77
78/*
79 * We have one of these per vbd, whether ide, scsi or 'other'. They
80 * hang in private_data off the gendisk structure. We may end up
81 * putting all kinds of interesting stuff here :-)
82 */
83struct blkfront_info
84{
85    spinlock_t io_lock;             /* request-queue lock (given to blk_init_queue); also held by the interrupt handler */
86    struct mutex mutex;             /* NOTE(review): users are outside the visible part of the file */
87    struct xenbus_device *xbdev;    /* owning xenbus device */
88    struct gendisk *gd;             /* disk created by xlvbd_alloc_gendisk(), NULL when none */
89    int vdevice;                    /* virtual device number decoded in xlvbd_alloc_gendisk() */
90    blkif_vdev_t handle;            /* copied into the handle field of every ring request */
91    enum blkif_state connected;     /* see enum blkif_state */
92    int ring_ref;                   /* grant reference of the shared ring page, or GRANT_INVALID_REF */
93    struct blkif_front_ring ring;   /* front half of the shared request/response ring */
94    struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* scratch list filled by blk_rq_map_sg() per request */
95    unsigned int evtchn, irq;       /* event channel and the irq it is bound to; 0 when unbound */
96    struct request_queue *rq;       /* request queue of gd, NULL when none */
97    struct work_struct work;        /* scheduled by blkif_restart_queue_callback(); handled by blkif_restart_queue() */
98    struct gnttab_free_callback callback; /* registered when grant references run out */
99    struct blk_shadow shadow[BLK_RING_SIZE]; /* one slot per possible request id */
100    unsigned long shadow_free;      /* index of the first free shadow slot (head of the free list) */
101    unsigned int feature_flush;     /* flush flags handed to blk_queue_flush() */
102    unsigned int flush_op;          /* ring operation used for REQ_FLUSH/REQ_FUA requests; 0 rejects them */
103    unsigned int feature_discard:1; /* enables QUEUE_FLAG_DISCARD on the queue */
104    unsigned int feature_secdiscard:1; /* enables QUEUE_FLAG_SECDISCARD on the queue */
105    unsigned int discard_granularity; /* copied to rq->limits.discard_granularity */
106    unsigned int discard_alignment; /* copied to rq->limits.discard_alignment */
107    int is_ready;                   /* NOTE(review): users are outside the visible part of the file */
108};
109
/*
 * Bitmap of reserved minor numbers: 'minors' points to the bitmap,
 * 'nr_minors' is its current size in bits, and 'minor_lock' is held
 * around every read or update of the bitmap contents.
 */
110static unsigned int nr_minors;
111static unsigned long *minors;
112static DEFINE_SPINLOCK(minor_lock);
113
/* Number of segments that can be in flight when every ring slot is full. */
114#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
115    (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
/* Value stored in blkfront_info.ring_ref while no ring page is granted. */
116#define GRANT_INVALID_REF 0
117
/* Minors reserved per disk for non-extended and extended vdevice numbers. */
118#define PARTS_PER_DISK 16
119#define PARTS_PER_EXT_DISK 256
120
/* Non-extended vdevice encoding: major in bits 8 and up, minor in bits 0-7. */
121#define BLKIF_MAJOR(dev) ((dev)>>8)
122#define BLKIF_MINOR(dev) ((dev) & 0xff)
123
/* Extended vdevice encoding: bit EXT_SHIFT is the marker, the rest is the minor. */
124#define EXT_SHIFT 28
125#define EXTENDED (1<<EXT_SHIFT)
126#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
127#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
/* Offsets added by xen_translate_vdev() for emulated IDE (HD) and SCSI (SD) majors. */
128#define EMULATED_HD_DISK_MINOR_OFFSET (0)
129#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
130#define EMULATED_SD_DISK_MINOR_OFFSET (0)
131#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
132
133#define DEV_NAME "xvd" /* name in /dev */
134
135static int get_id_from_freelist(struct blkfront_info *info)
136{
137    unsigned long free = info->shadow_free;
138    BUG_ON(free >= BLK_RING_SIZE);
139    info->shadow_free = info->shadow[free].req.u.rw.id;
140    info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
141    return free;
142}
143
144static int add_id_to_freelist(struct blkfront_info *info,
145                   unsigned long id)
146{
147    if (info->shadow[id].req.u.rw.id != id)
148        return -EINVAL;
149    if (info->shadow[id].request == NULL)
150        return -EINVAL;
151    info->shadow[id].req.u.rw.id = info->shadow_free;
152    info->shadow[id].request = NULL;
153    info->shadow_free = id;
154    return 0;
155}
156
157static const char *op_name(int op)
158{
159    static const char *const names[] = {
160        [BLKIF_OP_READ] = "read",
161        [BLKIF_OP_WRITE] = "write",
162        [BLKIF_OP_WRITE_BARRIER] = "barrier",
163        [BLKIF_OP_FLUSH_DISKCACHE] = "flush",
164        [BLKIF_OP_DISCARD] = "discard" };
165
166    if (op < 0 || op >= ARRAY_SIZE(names))
167        return "unknown";
168
169    if (!names[op])
170        return "reserved";
171
172    return names[op];
173}
174static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
175{
176    unsigned int end = minor + nr;
177    int rc;
178
179    if (end > nr_minors) {
180        unsigned long *bitmap, *old;
181
182        bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
183                 GFP_KERNEL);
184        if (bitmap == NULL)
185            return -ENOMEM;
186
187        spin_lock(&minor_lock);
188        if (end > nr_minors) {
189            old = minors;
190            memcpy(bitmap, minors,
191                   BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
192            minors = bitmap;
193            nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
194        } else
195            old = bitmap;
196        spin_unlock(&minor_lock);
197        kfree(old);
198    }
199
200    spin_lock(&minor_lock);
201    if (find_next_bit(minors, end, minor) >= end) {
202        bitmap_set(minors, minor, nr);
203        rc = 0;
204    } else
205        rc = -EBUSY;
206    spin_unlock(&minor_lock);
207
208    return rc;
209}
210
211static void xlbd_release_minors(unsigned int minor, unsigned int nr)
212{
213    unsigned int end = minor + nr;
214
215    BUG_ON(end > nr_minors);
216    spin_lock(&minor_lock);
217    bitmap_clear(minors, minor, nr);
218    spin_unlock(&minor_lock);
219}
220
221static void blkif_restart_queue_callback(void *arg)
222{
223    struct blkfront_info *info = (struct blkfront_info *)arg;
224    schedule_work(&info->work);
225}
226
227static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
228{
229    /* We don't have real geometry info, but let's at least return
230       values consistent with the size of the device */
231    sector_t nsect = get_capacity(bd->bd_disk);
232    sector_t cylinders = nsect;
233
234    hg->heads = 0xff;
235    hg->sectors = 0x3f;
236    sector_div(cylinders, hg->heads * hg->sectors);
237    hg->cylinders = cylinders;
238    if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
239        hg->cylinders = 0xffff;
240    return 0;
241}
242
243static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
244               unsigned command, unsigned long argument)
245{
246    struct blkfront_info *info = bdev->bd_disk->private_data;
247    int i;
248
249    dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
250        command, (long)argument);
251
252    switch (command) {
253    case CDROMMULTISESSION:
254        dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
255        for (i = 0; i < sizeof(struct cdrom_multisession); i++)
256            if (put_user(0, (char __user *)(argument + i)))
257                return -EFAULT;
258        return 0;
259
260    case CDROM_GET_CAPABILITY: {
261        struct gendisk *gd = info->gd;
262        if (gd->flags & GENHD_FL_CD)
263            return 0;
264        return -EINVAL;
265    }
266
267    default:
268        /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
269          command);*/
270        return -EINVAL; /* same return as native Linux */
271    }
272
273    return 0;
274}
275
276/*
277 * Generate a Xen blkfront IO request from a blk layer request. Reads
278 * and writes are handled as expected.
279 *
280 * @req: a request struct
 *
 * Returns 0 once the request has been written to the next private ring
 * slot (it is not pushed to the backend here; see flush_requests()).
 * Returns 1 if nothing was queued: either the device is not connected,
 * or no grant references were available, in which case a grant-table
 * free callback is registered so the queue can be restarted later.
281 */
282static int blkif_queue_request(struct request *req)
283{
284    struct blkfront_info *info = req->rq_disk->private_data;
285    unsigned long buffer_mfn;
286    struct blkif_request *ring_req;
287    unsigned long id;
288    unsigned int fsect, lsect;
289    int i, ref;
290    grant_ref_t gref_head;
291    struct scatterlist *sg;
292
293    if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
294        return 1;
295
    /* Reserve enough grant references for a maximally sized request. */
296    if (gnttab_alloc_grant_references(
297        BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
298        gnttab_request_free_callback(
299            &info->callback,
300            blkif_restart_queue_callback,
301            info,
302            BLKIF_MAX_SEGMENTS_PER_REQUEST);
303        return 1;
304    }
305
306    /* Fill out a communications ring structure. */
307    ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
308    id = get_id_from_freelist(info);
309    info->shadow[id].request = req;
310
311    ring_req->u.rw.id = id;
312    ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
313    ring_req->u.rw.handle = info->handle;
314
    /* Default operation follows the data direction; it may be overridden below. */
315    ring_req->operation = rq_data_dir(req) ?
316        BLKIF_OP_WRITE : BLKIF_OP_READ;
317
318    if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
319        /*
320         * Ideally we can do an unordered flush-to-disk. In case the
321         * backend only supports barriers, use that. A barrier request
322         * is a superset of FUA, so we can implement it the same
323         * way. (It's also a FLUSH+FUA, since it is
324         * guaranteed ordered WRT previous writes.)
325         */
326        ring_req->operation = info->flush_op;
327    }
328
329    if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
330        /* id, sector_number and handle are set above. */
331        ring_req->operation = BLKIF_OP_DISCARD;
332        ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
333        if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
334            ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
335        else
336            ring_req->u.discard.flag = 0;
337    } else {
        /* Map the request's data into info->sg; one ring segment per sg entry. */
338        ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
339                               info->sg);
340        BUG_ON(ring_req->u.rw.nr_segments >
341               BLKIF_MAX_SEGMENTS_PER_REQUEST);
342
343        for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
344            buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
            /* First and last 512-byte sector of this segment within its page. */
345            fsect = sg->offset >> 9;
346            lsect = fsect + (sg->length >> 9) - 1;
347            /* install a grant reference. */
348            ref = gnttab_claim_grant_reference(&gref_head);
349            BUG_ON(ref == -ENOSPC);
350
351            gnttab_grant_foreign_access_ref(
352                    ref,
353                    info->xbdev->otherend_id,
354                    buffer_mfn,
355                    rq_data_dir(req));
356
357            info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
358            ring_req->u.rw.seg[i] =
359                    (struct blkif_request_segment) {
360                        .gref = ref,
361                        .first_sect = fsect,
362                        .last_sect = lsect };
363        }
364    }
365
    /* Consume the ring slot privately; flush_requests() publishes it. */
366    info->ring.req_prod_pvt++;
367
368    /* Keep a private copy so we can reissue requests when recovering. */
369    info->shadow[id].req = *ring_req;
370
    /* Hand back whatever remains of the references reserved above. */
371    gnttab_free_grant_references(gref_head);
372
373    return 0;
374}
375
376
377static inline void flush_requests(struct blkfront_info *info)
378{
379    int notify;
380
381    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
382
383    if (notify)
384        notify_remote_via_irq(info->irq);
385}
386
387/*
388 * do_blkif_request
389 * read a block; request is in a request queue
390 */
391static void do_blkif_request(struct request_queue *rq)
392{
393    struct blkfront_info *info = NULL;
394    struct request *req;
395    int queued;
396
397    pr_debug("Entered do_blkif_request\n");
398
399    queued = 0;
400
401    while ((req = blk_peek_request(rq)) != NULL) {
402        info = req->rq_disk->private_data;
403
404        if (RING_FULL(&info->ring))
405            goto wait;
406
407        blk_start_request(req);
408
409        if ((req->cmd_type != REQ_TYPE_FS) ||
410            ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
411            !info->flush_op)) {
412            __blk_end_request_all(req, -EIO);
413            continue;
414        }
415
416        pr_debug("do_blk_req %p: cmd %p, sec %lx, "
417             "(%u/%u) buffer:%p [%s]\n",
418             req, req->cmd, (unsigned long)blk_rq_pos(req),
419             blk_rq_cur_sectors(req), blk_rq_sectors(req),
420             req->buffer, rq_data_dir(req) ? "write" : "read");
421
422        if (blkif_queue_request(req)) {
423            blk_requeue_request(rq, req);
424wait:
425            /* Avoid pointless unplugs. */
426            blk_stop_queue(rq);
427            break;
428        }
429
430        queued++;
431    }
432
433    if (queued != 0)
434        flush_requests(info);
435}
436
437static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
438{
439    struct request_queue *rq;
440    struct blkfront_info *info = gd->private_data;
441
442    rq = blk_init_queue(do_blkif_request, &info->io_lock);
443    if (rq == NULL)
444        return -1;
445
446    queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
447
448    if (info->feature_discard) {
449        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
450        blk_queue_max_discard_sectors(rq, get_capacity(gd));
451        rq->limits.discard_granularity = info->discard_granularity;
452        rq->limits.discard_alignment = info->discard_alignment;
453        if (info->feature_secdiscard)
454            queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
455    }
456
457    /* Hard sector size and max sectors impersonate the equiv. hardware. */
458    blk_queue_logical_block_size(rq, sector_size);
459    blk_queue_max_hw_sectors(rq, 512);
460
461    /* Each segment in a request is up to an aligned page in size. */
462    blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
463    blk_queue_max_segment_size(rq, PAGE_SIZE);
464
465    /* Ensure a merged request will fit in a single I/O ring slot. */
466    blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
467
468    /* Make sure buffer addresses are sector-aligned. */
469    blk_queue_dma_alignment(rq, 511);
470
471    /* Make sure we don't use bounce buffers. */
472    blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
473
474    gd->queue = rq;
475
476    return 0;
477}
478
479
480static void xlvbd_flush(struct blkfront_info *info)
481{
482    blk_queue_flush(info->rq, info->feature_flush);
483    printk(KERN_INFO "blkfront: %s: %s: %s\n",
484           info->gd->disk_name,
485           info->flush_op == BLKIF_OP_WRITE_BARRIER ?
486        "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
487        "flush diskcache" : "barrier or flush"),
488           info->feature_flush ? "enabled" : "disabled");
489}
490
491static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
492{
493    int major;
494    major = BLKIF_MAJOR(vdevice);
495    *minor = BLKIF_MINOR(vdevice);
496    switch (major) {
497        case XEN_IDE0_MAJOR:
498            *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
499            *minor = ((*minor / 64) * PARTS_PER_DISK) +
500                EMULATED_HD_DISK_MINOR_OFFSET;
501            break;
502        case XEN_IDE1_MAJOR:
503            *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
504            *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
505                EMULATED_HD_DISK_MINOR_OFFSET;
506            break;
507        case XEN_SCSI_DISK0_MAJOR:
508            *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
509            *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
510            break;
511        case XEN_SCSI_DISK1_MAJOR:
512        case XEN_SCSI_DISK2_MAJOR:
513        case XEN_SCSI_DISK3_MAJOR:
514        case XEN_SCSI_DISK4_MAJOR:
515        case XEN_SCSI_DISK5_MAJOR:
516        case XEN_SCSI_DISK6_MAJOR:
517        case XEN_SCSI_DISK7_MAJOR:
518            *offset = (*minor / PARTS_PER_DISK) +
519                ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
520                EMULATED_SD_DISK_NAME_OFFSET;
521            *minor = *minor +
522                ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
523                EMULATED_SD_DISK_MINOR_OFFSET;
524            break;
525        case XEN_SCSI_DISK8_MAJOR:
526        case XEN_SCSI_DISK9_MAJOR:
527        case XEN_SCSI_DISK10_MAJOR:
528        case XEN_SCSI_DISK11_MAJOR:
529        case XEN_SCSI_DISK12_MAJOR:
530        case XEN_SCSI_DISK13_MAJOR:
531        case XEN_SCSI_DISK14_MAJOR:
532        case XEN_SCSI_DISK15_MAJOR:
533            *offset = (*minor / PARTS_PER_DISK) +
534                ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
535                EMULATED_SD_DISK_NAME_OFFSET;
536            *minor = *minor +
537                ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
538                EMULATED_SD_DISK_MINOR_OFFSET;
539            break;
540        case XENVBD_MAJOR:
541            *offset = *minor / PARTS_PER_DISK;
542            break;
543        default:
544            printk(KERN_WARNING "blkfront: your disk configuration is "
545                    "incorrect, please use an xvd device instead\n");
546            return -ENODEV;
547    }
548    return 0;
549}
550
/*
 * Write the letter suffix for disk index @n at @ptr and return a pointer
 * just past the last letter written. No terminating NUL is stored.
 *
 * The encoding is a..z, aa..az, ba..zz, aaa.. (0 -> "a", 25 -> "z",
 * 26 -> "aa"). The caller must provide enough room for the letters.
 */
static char *encode_disk_name(char *ptr, unsigned int n)
{
    /* Each letter covers more than 4 bits of n, so bits/4 + 1 letters suffice. */
    char reversed[sizeof(n) * 8 / 4 + 1];
    int count = 0;

    /* Produce the letters least-significant first. */
    for (;;) {
        reversed[count++] = 'a' + n % 26;
        if (n < 26)
            break;
        n = n / 26 - 1;
    }

    /* Emit them most-significant first. */
    while (count > 0)
        *ptr++ = reversed[--count];

    return ptr;
}
558
559static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
560                   struct blkfront_info *info,
561                   u16 vdisk_info, u16 sector_size)
562{
563    struct gendisk *gd;
564    int nr_minors = 1;
565    int err;
566    unsigned int offset;
567    int minor;
568    int nr_parts;
569    char *ptr;
570
571    BUG_ON(info->gd != NULL);
572    BUG_ON(info->rq != NULL);
573
574    if ((info->vdevice>>EXT_SHIFT) > 1) {
575        /* this is above the extended range; something is wrong */
576        printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
577        return -ENODEV;
578    }
579
580    if (!VDEV_IS_EXTENDED(info->vdevice)) {
581        err = xen_translate_vdev(info->vdevice, &minor, &offset);
582        if (err)
583            return err;
584         nr_parts = PARTS_PER_DISK;
585    } else {
586        minor = BLKIF_MINOR_EXT(info->vdevice);
587        nr_parts = PARTS_PER_EXT_DISK;
588        offset = minor / nr_parts;
589        if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
590            printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
591                    "emulated IDE disks,\n\t choose an xvd device name"
592                    "from xvde on\n", info->vdevice);
593    }
594    if (minor >> MINORBITS) {
595        pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
596            info->vdevice, minor);
597        return -ENODEV;
598    }
599
600    if ((minor % nr_parts) == 0)
601        nr_minors = nr_parts;
602
603    err = xlbd_reserve_minors(minor, nr_minors);
604    if (err)
605        goto out;
606    err = -ENODEV;
607
608    gd = alloc_disk(nr_minors);
609    if (gd == NULL)
610        goto release;
611
612    strcpy(gd->disk_name, DEV_NAME);
613    ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
614    BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
615    if (nr_minors > 1)
616        *ptr = 0;
617    else
618        snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
619             "%d", minor & (nr_parts - 1));
620
621    gd->major = XENVBD_MAJOR;
622    gd->first_minor = minor;
623    gd->fops = &xlvbd_block_fops;
624    gd->private_data = info;
625    gd->driverfs_dev = &(info->xbdev->dev);
626    set_capacity(gd, capacity);
627
628    if (xlvbd_init_blk_queue(gd, sector_size)) {
629        del_gendisk(gd);
630        goto release;
631    }
632
633    info->rq = gd->queue;
634    info->gd = gd;
635
636    xlvbd_flush(info);
637
638    if (vdisk_info & VDISK_READONLY)
639        set_disk_ro(gd, 1);
640
641    if (vdisk_info & VDISK_REMOVABLE)
642        gd->flags |= GENHD_FL_REMOVABLE;
643
644    if (vdisk_info & VDISK_CDROM)
645        gd->flags |= GENHD_FL_CD;
646
647    return 0;
648
649 release:
650    xlbd_release_minors(minor, nr_minors);
651 out:
652    return err;
653}
654
655static void xlvbd_release_gendisk(struct blkfront_info *info)
656{
657    unsigned int minor, nr_minors;
658    unsigned long flags;
659
660    if (info->rq == NULL)
661        return;
662
663    spin_lock_irqsave(&info->io_lock, flags);
664
665    /* No more blkif_request(). */
666    blk_stop_queue(info->rq);
667
668    /* No more gnttab callback work. */
669    gnttab_cancel_free_callback(&info->callback);
670    spin_unlock_irqrestore(&info->io_lock, flags);
671
672    /* Flush gnttab callback work. Must be done with no locks held. */
673    flush_work_sync(&info->work);
674
675    del_gendisk(info->gd);
676
677    minor = info->gd->first_minor;
678    nr_minors = info->gd->minors;
679    xlbd_release_minors(minor, nr_minors);
680
681    blk_cleanup_queue(info->rq);
682    info->rq = NULL;
683
684    put_disk(info->gd);
685    info->gd = NULL;
686}
687
688static void kick_pending_request_queues(struct blkfront_info *info)
689{
690    if (!RING_FULL(&info->ring)) {
691        /* Re-enable calldowns. */
692        blk_start_queue(info->rq);
693        /* Kick things off immediately. */
694        do_blkif_request(info->rq);
695    }
696}
697
698static void blkif_restart_queue(struct work_struct *work)
699{
700    struct blkfront_info *info = container_of(work, struct blkfront_info, work);
701
702    spin_lock_irq(&info->io_lock);
703    if (info->connected == BLKIF_STATE_CONNECTED)
704        kick_pending_request_queues(info);
705    spin_unlock_irq(&info->io_lock);
706}
707
708static void blkif_free(struct blkfront_info *info, int suspend)
709{
710    /* Prevent new requests being issued until we fix things up. */
711    spin_lock_irq(&info->io_lock);
712    info->connected = suspend ?
713        BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
714    /* No more blkif_request(). */
715    if (info->rq)
716        blk_stop_queue(info->rq);
717    /* No more gnttab callback work. */
718    gnttab_cancel_free_callback(&info->callback);
719    spin_unlock_irq(&info->io_lock);
720
721    /* Flush gnttab callback work. Must be done with no locks held. */
722    flush_work_sync(&info->work);
723
724    /* Free resources associated with old device channel. */
725    if (info->ring_ref != GRANT_INVALID_REF) {
726        gnttab_end_foreign_access(info->ring_ref, 0,
727                      (unsigned long)info->ring.sring);
728        info->ring_ref = GRANT_INVALID_REF;
729        info->ring.sring = NULL;
730    }
731    if (info->irq)
732        unbind_from_irqhandler(info->irq, info);
733    info->evtchn = info->irq = 0;
734
735}
736
737static void blkif_completion(struct blk_shadow *s)
738{
739    int i;
740    /* Do not let BLKIF_OP_DISCARD as nr_segment is in the same place
741     * flag. */
742    for (i = 0; i < s->req.u.rw.nr_segments; i++)
743        gnttab_end_foreign_access(s->req.u.rw.seg[i].gref, 0, 0UL);
744}
745
/*
 * Interrupt handler bound to the device's event channel (see
 * setup_blkring()); @dev_id is the struct blkfront_info.
 *
 * With io_lock held it walks every response the backend has produced,
 * ends the grants of non-discard requests, returns the shadow slot to the
 * free list, completes the block-layer request with a status derived from
 * the response, and finally restarts the request queue if the ring has
 * room. Does nothing unless the device is connected. Always returns
 * IRQ_HANDLED.
 */
746static irqreturn_t blkif_interrupt(int irq, void *dev_id)
747{
748    struct request *req;
749    struct blkif_response *bret;
750    RING_IDX i, rp;
751    unsigned long flags;
752    struct blkfront_info *info = (struct blkfront_info *)dev_id;
753    int error;
754
755    spin_lock_irqsave(&info->io_lock, flags);
756
757    if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
758        spin_unlock_irqrestore(&info->io_lock, flags);
759        return IRQ_HANDLED;
760    }
761
762 again:
    /* Snapshot the backend's response producer index. */
763    rp = info->ring.sring->rsp_prod;
764    rmb(); /* Ensure we see queued responses up to 'rp'. */
765
766    for (i = info->ring.rsp_cons; i != rp; i++) {
767        unsigned long id;
768
769        bret = RING_GET_RESPONSE(&info->ring, i);
770        id = bret->id;
771        /*
772         * The backend has messed up and given us an id that we would
773         * never have given to it (we stamp it up to BLK_RING_SIZE -
774         * look in get_id_from_freelist).
775         */
776        if (id >= BLK_RING_SIZE) {
777            WARN(1, "%s: response to %s has incorrect id (%ld)\n",
778                 info->gd->disk_name, op_name(bret->operation), id);
779            /* We can't safely get the 'struct request' as
780             * the id is busted. */
781            continue;
782        }
        /* Fetch the request before the slot is recycled, which clears it. */
783        req = info->shadow[id].request;
784
785        if (bret->operation != BLKIF_OP_DISCARD)
786            blkif_completion(&info->shadow[id]);
787
        /* Fails if the slot does not hold an outstanding request with this id. */
788        if (add_id_to_freelist(info, id)) {
789            WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
790                 info->gd->disk_name, op_name(bret->operation), id);
791            continue;
792        }
793
794        error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
795        switch (bret->operation) {
796        case BLKIF_OP_DISCARD:
797            if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
                /* Backend cannot discard: turn the feature off for this queue. */
798                struct request_queue *rq = info->rq;
799                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
800                       info->gd->disk_name, op_name(bret->operation));
801                error = -EOPNOTSUPP;
802                info->feature_discard = 0;
803                info->feature_secdiscard = 0;
804                queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
805                queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
806            }
807            __blk_end_request_all(req, error);
808            break;
809        case BLKIF_OP_FLUSH_DISKCACHE:
810        case BLKIF_OP_WRITE_BARRIER:
811            if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
812                printk(KERN_WARNING "blkfront: %s: %s op failed\n",
813                       info->gd->disk_name, op_name(bret->operation));
814                error = -EOPNOTSUPP;
815            }
            /* An error on a request that carried no segments is also treated as "not supported". */
816            if (unlikely(bret->status == BLKIF_RSP_ERROR &&
817                     info->shadow[id].req.u.rw.nr_segments == 0)) {
818                printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
819                       info->gd->disk_name, op_name(bret->operation));
820                error = -EOPNOTSUPP;
821            }
822            if (unlikely(error)) {
                /* "Not supported" is not reported as a request failure; any error disables flushing. */
823                if (error == -EOPNOTSUPP)
824                    error = 0;
825                info->feature_flush = 0;
826                info->flush_op = 0;
827                xlvbd_flush(info);
828            }
829            /* fall through */
830        case BLKIF_OP_READ:
831        case BLKIF_OP_WRITE:
832            if (unlikely(bret->status != BLKIF_RSP_OKAY))
833                dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
834                    "request: %x\n", bret->status);
835
836            __blk_end_request_all(req, error);
837            break;
838        default:
839            BUG();
840        }
841    }
842
843    info->ring.rsp_cons = i;
844
    /*
     * If requests are still outstanding, re-check for responses that
     * arrived while we were processing; otherwise set rsp_event to the
     * index of the next response.
     */
845    if (i != info->ring.req_prod_pvt) {
846        int more_to_do;
847        RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
848        if (more_to_do)
849            goto again;
850    } else
851        info->ring.sring->rsp_event = i + 1;
852
853    kick_pending_request_queues(info);
854
855    spin_unlock_irqrestore(&info->io_lock, flags);
856
857    return IRQ_HANDLED;
858}
859
860
861static int setup_blkring(struct xenbus_device *dev,
862             struct blkfront_info *info)
863{
864    struct blkif_sring *sring;
865    int err;
866
867    info->ring_ref = GRANT_INVALID_REF;
868
869    sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
870    if (!sring) {
871        xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
872        return -ENOMEM;
873    }
874    SHARED_RING_INIT(sring);
875    FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
876
877    sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
878
879    err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
880    if (err < 0) {
881        free_page((unsigned long)sring);
882        info->ring.sring = NULL;
883        goto fail;
884    }
885    info->ring_ref = err;
886
887    err = xenbus_alloc_evtchn(dev, &info->evtchn);
888    if (err)
889        goto fail;
890
891    err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
892                    "blkif", info);
893    if (err <= 0) {
894        xenbus_dev_fatal(dev, err,
895                 "bind_evtchn_to_irqhandler failed");
896        goto fail;
897    }
898    info->irq = err;
899
900    return 0;
901fail:
902    blkif_free(info, 0);
903    return err;
904}
905
906
907/* Common code used when first setting up, and when resuming. */
908static int talk_to_blkback(struct xenbus_device *dev,
909               struct blkfront_info *info)
910{
911    const char *message = NULL;
912    struct xenbus_transaction xbt;
913    int err;
914
915    /* Create shared ring, alloc event channel. */
916    err = setup_blkring(dev, info);
917    if (err)
918        goto out;
919
920again:
921    err = xenbus_transaction_start(&xbt);
922    if (err) {
923        xenbus_dev_fatal(dev, err, "starting transaction");
924        goto destroy_blkring;
925    }
926
927    err = xenbus_printf(xbt, dev->nodename,
928                "ring-ref", "%u", info->ring_ref);
929    if (err) {
930        message = "writing ring-ref";
931        goto abort_transaction;
932    }
933    err = xenbus_printf(xbt, dev->nodename,
934                "event-channel", "%u", info->evtchn);
935    if (err) {
936        message = "writing event-channel";
937        goto abort_transaction;
938    }
939    err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
940                XEN_IO_PROTO_ABI_NATIVE);
941    if (err) {
942        message = "writing protocol";
943        goto abort_transaction;
944    }
945
946    err = xenbus_transaction_end(xbt, 0);
947    if (err) {
948        if (err == -EAGAIN)
949            goto again;
950        xenbus_dev_fatal(dev, err, "completing transaction");
951        goto destroy_blkring;
952    }
953
954    xenbus_switch_state(dev, XenbusStateInitialised);
955
956    return 0;
957
958 abort_transaction:
959    xenbus_transaction_end(xbt, 1);
960    if (message)
961        xenbus_dev_fatal(dev, err, "%s", message);
962 destroy_blkring:
963    blkif_free(info, 0);
964 out:
965    return err;
966}
967
968/**
969 * Entry point to this code when a new device is created. Allocate the basic
970 * structures and the ring buffer for communication with the backend, and
971 * inform the backend of the appropriate details for those. Switch to
972 * Initialised state.
973 */
974static int blkfront_probe(struct xenbus_device *dev,
975              const struct xenbus_device_id *id)
976{
977    int err, vdevice, i;
978    struct blkfront_info *info;
979
980    /* FIXME: Use dynamic device id if this is not set. */
981    err = xenbus_scanf(XBT_NIL, dev->nodename,
982               "virtual-device", "%i", &vdevice);
983    if (err != 1) {
984        /* go looking in the extended area instead */
985        err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
986                   "%i", &vdevice);
987        if (err != 1) {
988            xenbus_dev_fatal(dev, err, "reading virtual-device");
989            return err;
990        }
991    }
992
993    if (xen_hvm_domain()) {
994        char *type;
995        int len;
996        /* no unplug has been done: do not hook devices != xen vbds */
997        if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
998            int major;
999
1000            if (!VDEV_IS_EXTENDED(vdevice))
1001                major = BLKIF_MAJOR(vdevice);
1002            else
1003                major = XENVBD_MAJOR;
1004
1005            if (major != XENVBD_MAJOR) {
1006                printk(KERN_INFO
1007                        "%s: HVM does not support vbd %d as xen block device\n",
1008                        __FUNCTION__, vdevice);
1009                return -ENODEV;
1010            }
1011        }
1012        /* do not create a PV cdrom device if we are an HVM guest */
1013        type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
1014        if (IS_ERR(type))
1015            return -ENODEV;
1016        if (strncmp(type, "cdrom", 5) == 0) {
1017            kfree(type);
1018            return -ENODEV;
1019        }
1020        kfree(type);
1021    }
1022    info = kzalloc(sizeof(*info), GFP_KERNEL);
1023    if (!info) {
1024        xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
1025        return -ENOMEM;
1026    }
1027
1028    mutex_init(&info->mutex);
1029    spin_lock_init(&info->io_lock);
1030    info->xbdev = dev;
1031    info->vdevice = vdevice;
1032    info->connected = BLKIF_STATE_DISCONNECTED;
1033    INIT_WORK(&info->work, blkif_restart_queue);
1034
1035    for (i = 0; i < BLK_RING_SIZE; i++)
1036        info->shadow[i].req.u.rw.id = i+1;
1037    info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1038
1039    /* Front end dir is a number, which is used as the id. */
1040    info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
1041    dev_set_drvdata(&dev->dev, info);
1042
1043    err = talk_to_blkback(dev, info);
1044    if (err) {
1045        kfree(info);
1046        dev_set_drvdata(&dev->dev, NULL);
1047        return err;
1048    }
1049
1050    return 0;
1051}
1052
1053
1054static int blkif_recover(struct blkfront_info *info)
1055{
1056    int i;
1057    struct blkif_request *req;
1058    struct blk_shadow *copy;
1059    int j;
1060
1061    /* Stage 1: Make a safe copy of the shadow state. */
1062    copy = kmalloc(sizeof(info->shadow),
1063               GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
1064    if (!copy)
1065        return -ENOMEM;
1066    memcpy(copy, info->shadow, sizeof(info->shadow));
1067
1068    /* Stage 2: Set up free list. */
1069    memset(&info->shadow, 0, sizeof(info->shadow));
1070    for (i = 0; i < BLK_RING_SIZE; i++)
1071        info->shadow[i].req.u.rw.id = i+1;
1072    info->shadow_free = info->ring.req_prod_pvt;
1073    info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
1074
1075    /* Stage 3: Find pending requests and requeue them. */
1076    for (i = 0; i < BLK_RING_SIZE; i++) {
1077        /* Not in use? */
1078        if (!copy[i].request)
1079            continue;
1080
1081        /* Grab a request slot and copy shadow state into it. */
1082        req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
1083        *req = copy[i].req;
1084
1085        /* We get a new request id, and must reset the shadow state. */
1086        req->u.rw.id = get_id_from_freelist(info);
1087        memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
1088
1089        if (req->operation != BLKIF_OP_DISCARD) {
1090        /* Rewrite any grant references invalidated by susp/resume. */
1091            for (j = 0; j < req->u.rw.nr_segments; j++)
1092                gnttab_grant_foreign_access_ref(
1093                    req->u.rw.seg[j].gref,
1094                    info->xbdev->otherend_id,
1095                    pfn_to_mfn(info->shadow[req->u.rw.id].frame[j]),
1096                    rq_data_dir(info->shadow[req->u.rw.id].request));
1097        }
1098        info->shadow[req->u.rw.id].req = *req;
1099
1100        info->ring.req_prod_pvt++;
1101    }
1102
1103    kfree(copy);
1104
1105    xenbus_switch_state(info->xbdev, XenbusStateConnected);
1106
1107    spin_lock_irq(&info->io_lock);
1108
1109    /* Now safe for us to use the shared ring */
1110    info->connected = BLKIF_STATE_CONNECTED;
1111
1112    /* Send off requeued requests */
1113    flush_requests(info);
1114
1115    /* Kick any other new requests queued since we resumed */
1116    kick_pending_request_queues(info);
1117
1118    spin_unlock_irq(&info->io_lock);
1119
1120    return 0;
1121}
1122
1123/**
1124 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1125 * driver restart. We tear down our blkif structure and recreate it, but
1126 * leave the device-layer structures intact so that this is transparent to the
1127 * rest of the kernel.
1128 */
1129static int blkfront_resume(struct xenbus_device *dev)
1130{
1131    struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1132    int err;
1133
1134    dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
1135
1136    blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
1137
1138    err = talk_to_blkback(dev, info);
1139    if (info->connected == BLKIF_STATE_SUSPENDED && !err)
1140        err = blkif_recover(info);
1141
1142    return err;
1143}
1144
1145static void
1146blkfront_closing(struct blkfront_info *info)
1147{
1148    struct xenbus_device *xbdev = info->xbdev;
1149    struct block_device *bdev = NULL;
1150
1151    mutex_lock(&info->mutex);
1152
1153    if (xbdev->state == XenbusStateClosing) {
1154        mutex_unlock(&info->mutex);
1155        return;
1156    }
1157
1158    if (info->gd)
1159        bdev = bdget_disk(info->gd, 0);
1160
1161    mutex_unlock(&info->mutex);
1162
1163    if (!bdev) {
1164        xenbus_frontend_closed(xbdev);
1165        return;
1166    }
1167
1168    mutex_lock(&bdev->bd_mutex);
1169
1170    if (bdev->bd_openers) {
1171        xenbus_dev_error(xbdev, -EBUSY,
1172                 "Device in use; refusing to close");
1173        xenbus_switch_state(xbdev, XenbusStateClosing);
1174    } else {
1175        xlvbd_release_gendisk(info);
1176        xenbus_frontend_closed(xbdev);
1177    }
1178
1179    mutex_unlock(&bdev->bd_mutex);
1180    bdput(bdev);
1181}
1182
1183static void blkfront_setup_discard(struct blkfront_info *info)
1184{
1185    int err;
1186    char *type;
1187    unsigned int discard_granularity;
1188    unsigned int discard_alignment;
1189    unsigned int discard_secure;
1190
1191    type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
1192    if (IS_ERR(type))
1193        return;
1194
1195    info->feature_secdiscard = 0;
1196    if (strncmp(type, "phy", 3) == 0) {
1197        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1198            "discard-granularity", "%u", &discard_granularity,
1199            "discard-alignment", "%u", &discard_alignment,
1200            NULL);
1201        if (!err) {
1202            info->feature_discard = 1;
1203            info->discard_granularity = discard_granularity;
1204            info->discard_alignment = discard_alignment;
1205        }
1206        err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1207                "discard-secure", "%d", &discard_secure,
1208                NULL);
1209        if (!err)
1210            info->feature_secdiscard = discard_secure;
1211
1212    } else if (strncmp(type, "file", 4) == 0)
1213        info->feature_discard = 1;
1214
1215    kfree(type);
1216}
1217
1218/*
1219 * Invoked when the backend is finally 'ready' (and has told produced
1220 * the details about the physical device - #sectors, size, etc).
1221 */
1222static void blkfront_connect(struct blkfront_info *info)
1223{
1224    unsigned long long sectors;
1225    unsigned long sector_size;
1226    unsigned int binfo;
1227    int err;
1228    int barrier, flush, discard;
1229
1230    switch (info->connected) {
1231    case BLKIF_STATE_CONNECTED:
1232        /*
1233         * Potentially, the back-end may be signalling
1234         * a capacity change; update the capacity.
1235         */
1236        err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1237                   "sectors", "%Lu", &sectors);
1238        if (XENBUS_EXIST_ERR(err))
1239            return;
1240        printk(KERN_INFO "Setting capacity to %Lu\n",
1241               sectors);
1242        set_capacity(info->gd, sectors);
1243        revalidate_disk(info->gd);
1244
1245        /* fall through */
1246    case BLKIF_STATE_SUSPENDED:
1247        return;
1248
1249    default:
1250        break;
1251    }
1252
1253    dev_dbg(&info->xbdev->dev, "%s:%s.\n",
1254        __func__, info->xbdev->otherend);
1255
1256    err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1257                "sectors", "%llu", &sectors,
1258                "info", "%u", &binfo,
1259                "sector-size", "%lu", &sector_size,
1260                NULL);
1261    if (err) {
1262        xenbus_dev_fatal(info->xbdev, err,
1263                 "reading backend fields at %s",
1264                 info->xbdev->otherend);
1265        return;
1266    }
1267
1268    info->feature_flush = 0;
1269    info->flush_op = 0;
1270
1271    err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1272                "feature-barrier", "%d", &barrier,
1273                NULL);
1274
1275    /*
1276     * If there's no "feature-barrier" defined, then it means
1277     * we're dealing with a very old backend which writes
1278     * synchronously; nothing to do.
1279     *
1280     * If there are barriers, then we use flush.
1281     */
1282    if (!err && barrier) {
1283        info->feature_flush = REQ_FLUSH | REQ_FUA;
1284        info->flush_op = BLKIF_OP_WRITE_BARRIER;
1285    }
1286    /*
1287     * And if there is "feature-flush-cache" use that above
1288     * barriers.
1289     */
1290    err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1291                "feature-flush-cache", "%d", &flush,
1292                NULL);
1293
1294    if (!err && flush) {
1295        info->feature_flush = REQ_FLUSH;
1296        info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
1297    }
1298
1299    err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1300                "feature-discard", "%d", &discard,
1301                NULL);
1302
1303    if (!err && discard)
1304        blkfront_setup_discard(info);
1305
1306    err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1307    if (err) {
1308        xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1309                 info->xbdev->otherend);
1310        return;
1311    }
1312
1313    xenbus_switch_state(info->xbdev, XenbusStateConnected);
1314
1315    /* Kick pending requests. */
1316    spin_lock_irq(&info->io_lock);
1317    info->connected = BLKIF_STATE_CONNECTED;
1318    kick_pending_request_queues(info);
1319    spin_unlock_irq(&info->io_lock);
1320
1321    add_disk(info->gd);
1322
1323    info->is_ready = 1;
1324}
1325
1326/**
1327 * Callback received when the backend's state changes.
1328 */
1329static void blkback_changed(struct xenbus_device *dev,
1330                enum xenbus_state backend_state)
1331{
1332    struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1333
1334    dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
1335
1336    switch (backend_state) {
1337    case XenbusStateInitialising:
1338    case XenbusStateInitWait:
1339    case XenbusStateInitialised:
1340    case XenbusStateReconfiguring:
1341    case XenbusStateReconfigured:
1342    case XenbusStateUnknown:
1343    case XenbusStateClosed:
1344        break;
1345
1346    case XenbusStateConnected:
1347        blkfront_connect(info);
1348        break;
1349
1350    case XenbusStateClosing:
1351        blkfront_closing(info);
1352        break;
1353    }
1354}
1355
1356static int blkfront_remove(struct xenbus_device *xbdev)
1357{
1358    struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
1359    struct block_device *bdev = NULL;
1360    struct gendisk *disk;
1361
1362    dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
1363
1364    blkif_free(info, 0);
1365
1366    mutex_lock(&info->mutex);
1367
1368    disk = info->gd;
1369    if (disk)
1370        bdev = bdget_disk(disk, 0);
1371
1372    info->xbdev = NULL;
1373    mutex_unlock(&info->mutex);
1374
1375    if (!bdev) {
1376        kfree(info);
1377        return 0;
1378    }
1379
1380    /*
1381     * The xbdev was removed before we reached the Closed
1382     * state. See if it's safe to remove the disk. If the bdev
1383     * isn't closed yet, we let release take care of it.
1384     */
1385
1386    mutex_lock(&bdev->bd_mutex);
1387    info = disk->private_data;
1388
1389    dev_warn(disk_to_dev(disk),
1390         "%s was hot-unplugged, %d stale handles\n",
1391         xbdev->nodename, bdev->bd_openers);
1392
1393    if (info && !bdev->bd_openers) {
1394        xlvbd_release_gendisk(info);
1395        disk->private_data = NULL;
1396        kfree(info);
1397    }
1398
1399    mutex_unlock(&bdev->bd_mutex);
1400    bdput(bdev);
1401
1402    return 0;
1403}
1404
1405static int blkfront_is_ready(struct xenbus_device *dev)
1406{
1407    struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1408
1409    return info->is_ready && info->xbdev;
1410}
1411
1412static int blkif_open(struct block_device *bdev, fmode_t mode)
1413{
1414    struct gendisk *disk = bdev->bd_disk;
1415    struct blkfront_info *info;
1416    int err = 0;
1417
1418    mutex_lock(&blkfront_mutex);
1419
1420    info = disk->private_data;
1421    if (!info) {
1422        /* xbdev gone */
1423        err = -ERESTARTSYS;
1424        goto out;
1425    }
1426
1427    mutex_lock(&info->mutex);
1428
1429    if (!info->gd)
1430        /* xbdev is closed */
1431        err = -ERESTARTSYS;
1432
1433    mutex_unlock(&info->mutex);
1434
1435out:
1436    mutex_unlock(&blkfront_mutex);
1437    return err;
1438}
1439
1440static int blkif_release(struct gendisk *disk, fmode_t mode)
1441{
1442    struct blkfront_info *info = disk->private_data;
1443    struct block_device *bdev;
1444    struct xenbus_device *xbdev;
1445
1446    mutex_lock(&blkfront_mutex);
1447
1448    bdev = bdget_disk(disk, 0);
1449
1450    if (bdev->bd_openers)
1451        goto out;
1452
1453    /*
1454     * Check if we have been instructed to close. We will have
1455     * deferred this request, because the bdev was still open.
1456     */
1457
1458    mutex_lock(&info->mutex);
1459    xbdev = info->xbdev;
1460
1461    if (xbdev && xbdev->state == XenbusStateClosing) {
1462        /* pending switch to state closed */
1463        dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
1464        xlvbd_release_gendisk(info);
1465        xenbus_frontend_closed(info->xbdev);
1466     }
1467
1468    mutex_unlock(&info->mutex);
1469
1470    if (!xbdev) {
1471        /* sudden device removal */
1472        dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
1473        xlvbd_release_gendisk(info);
1474        disk->private_data = NULL;
1475        kfree(info);
1476    }
1477
1478out:
1479    bdput(bdev);
1480    mutex_unlock(&blkfront_mutex);
1481    return 0;
1482}
1483
1484static const struct block_device_operations xlvbd_block_fops =
1485{
1486    .owner = THIS_MODULE,
1487    .open = blkif_open,
1488    .release = blkif_release,
1489    .getgeo = blkif_getgeo,
1490    .ioctl = blkif_ioctl,
1491};
1492
1493
1494static const struct xenbus_device_id blkfront_ids[] = {
1495    { "vbd" },
1496    { "" }
1497};
1498
1499static DEFINE_XENBUS_DRIVER(blkfront, ,
1500    .probe = blkfront_probe,
1501    .remove = blkfront_remove,
1502    .resume = blkfront_resume,
1503    .otherend_changed = blkback_changed,
1504    .is_ready = blkfront_is_ready,
1505);
1506
1507static int __init xlblk_init(void)
1508{
1509    int ret;
1510
1511    if (!xen_domain())
1512        return -ENODEV;
1513
1514    if (xen_hvm_domain() && !xen_platform_pci_unplug)
1515        return -ENODEV;
1516
1517    if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
1518        printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
1519               XENVBD_MAJOR, DEV_NAME);
1520        return -ENODEV;
1521    }
1522
1523    ret = xenbus_register_frontend(&blkfront_driver);
1524    if (ret) {
1525        unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
1526        return ret;
1527    }
1528
1529    return 0;
1530}
1531module_init(xlblk_init);
1532
1533
1534static void __exit xlblk_exit(void)
1535{
1536    xenbus_unregister_driver(&blkfront_driver);
1537    unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
1538    kfree(minors);
1539}
1540module_exit(xlblk_exit);
1541
1542MODULE_DESCRIPTION("Xen virtual block device frontend");
1543MODULE_LICENSE("GPL");
1544MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
1545MODULE_ALIAS("xen:vbd");
1546MODULE_ALIAS("xenblk");
1547

Archive Download this file



interactive