kernel/kexec.c

1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/capability.h>
10#include <linux/mm.h>
11#include <linux/file.h>
12#include <linux/slab.h>
13#include <linux/fs.h>
14#include <linux/kexec.h>
15#include <linux/mutex.h>
16#include <linux/list.h>
17#include <linux/highmem.h>
18#include <linux/syscalls.h>
19#include <linux/reboot.h>
20#include <linux/ioport.h>
21#include <linux/hardirq.h>
22#include <linux/elf.h>
23#include <linux/elfcore.h>
24#include <linux/utsname.h>
25#include <linux/numa.h>
26#include <linux/suspend.h>
27#include <linux/device.h>
28#include <linux/freezer.h>
29#include <linux/pm.h>
30#include <linux/cpu.h>
31#include <linux/console.h>
32#include <linux/vmalloc.h>
33#include <linux/swap.h>
34#include <linux/syscore_ops.h>
35#include <linux/compiler.h>
36
37#include <asm/page.h>
38#include <asm/uaccess.h>
39#include <asm/io.h>
40#include <asm/sections.h>
41
42/* Per cpu memory for storing cpu states in case of system crash. */
43note_buf_t __percpu *crash_notes;
44
45/* vmcoreinfo stuff */
46static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
47u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
48size_t vmcoreinfo_size;
49size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
50
51/* Flag to indicate we are going to kexec a new kernel */
52bool kexec_in_progress = false;
53
54/* Location of the reserved area for the crash kernel */
55struct resource crashk_res = {
56    .name = "Crash kernel",
57    .start = 0,
58    .end = 0,
59    .flags = IORESOURCE_BUSY | IORESOURCE_MEM
60};
61struct resource crashk_low_res = {
62    .name = "Crash kernel",
63    .start = 0,
64    .end = 0,
65    .flags = IORESOURCE_BUSY | IORESOURCE_MEM
66};
67
68int kexec_should_crash(struct task_struct *p)
69{
70    if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
71        return 1;
72    return 0;
73}
74
75/*
76 * When kexec transitions to the new kernel there is a one-to-one
77 * mapping between physical and virtual addresses. On processors
78 * where you can disable the MMU this is trivial, and easy. For
79 * others it is still a simple predictable page table to setup.
80 *
81 * In that environment kexec copies the new kernel to its final
82 * resting place. This means I can only support memory whose
83 * physical address can fit in an unsigned long. In particular
84 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
85 * If the assembly stub has more restrictive requirements
86 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
87 * defined more restrictively in <asm/kexec.h>.
88 *
 * The code for the transition from the current kernel to the new
 * kernel is placed in the control_code_buffer, whose size
91 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
92 * page of memory is necessary, but some architectures require more.
93 * Because this memory must be identity mapped in the transition from
94 * virtual to physical addresses it must live in the range
95 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
96 * modifiable.
97 *
98 * The assembly stub in the control code buffer is passed a linked list
99 * of descriptor pages detailing the source pages of the new kernel,
100 * and the destination addresses of those source pages. As this data
101 * structure is not used in the context of the current OS, it must
102 * be self-contained.
103 *
104 * The code has been made to work with highmem pages and will use a
105 * destination page in its final resting place (if it happens
106 * to allocate it). The end product of this is that most of the
107 * physical address space, and most of RAM can be used.
108 *
109 * Future directions include:
110 * - allocating a page table with the control code buffer identity
111 * mapped, to simplify machine_kexec and make kexec_on_panic more
112 * reliable.
113 */
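/*
 * For reference, a sketch of the entry encoding as used by
 * kimage_add_entry() and for_each_kimage_entry() below: each
 * kimage_entry_t holds a page-aligned physical address with flag bits
 * in the low bits.  IND_DESTINATION marks where the following source
 * pages are to be copied, IND_SOURCE names one source page,
 * IND_INDIRECTION chains to the next page of entries, and IND_DONE
 * terminates the list.
 */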
114
115/*
 * KIMAGE_NO_DEST is an impossible destination address, used when
 * allocating pages whose destination address we do not care about.
118 */
119#define KIMAGE_NO_DEST (-1UL)
120
121static int kimage_is_destination_range(struct kimage *image,
122                       unsigned long start, unsigned long end);
123static struct page *kimage_alloc_page(struct kimage *image,
124                       gfp_t gfp_mask,
125                       unsigned long dest);
126
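/*
 * Allocate a struct kimage, copy in the user's segment descriptors and
 * sanity-check them: destinations must be page aligned, below the
 * destination memory limit, non-overlapping, and each bufsz must not
 * exceed its memsz.
 */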
127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
128                        unsigned long nr_segments,
129                            struct kexec_segment __user *segments)
130{
131    size_t segment_bytes;
132    struct kimage *image;
133    unsigned long i;
134    int result;
135
136    /* Allocate a controlling structure */
137    result = -ENOMEM;
138    image = kzalloc(sizeof(*image), GFP_KERNEL);
139    if (!image)
140        goto out;
141
142    image->head = 0;
143    image->entry = &image->head;
144    image->last_entry = &image->head;
145    image->control_page = ~0; /* By default this does not apply */
146    image->start = entry;
147    image->type = KEXEC_TYPE_DEFAULT;
148
149    /* Initialize the list of control pages */
150    INIT_LIST_HEAD(&image->control_pages);
151
152    /* Initialize the list of destination pages */
153    INIT_LIST_HEAD(&image->dest_pages);
154
155    /* Initialize the list of unusable pages */
156    INIT_LIST_HEAD(&image->unuseable_pages);
157
158    /* Read in the segments */
159    image->nr_segments = nr_segments;
160    segment_bytes = nr_segments * sizeof(*segments);
161    result = copy_from_user(image->segment, segments, segment_bytes);
162    if (result) {
163        result = -EFAULT;
164        goto out;
165    }
166
167    /*
168     * Verify we have good destination addresses. The caller is
169     * responsible for making certain we don't attempt to load
170     * the new image into invalid or reserved areas of RAM. This
171     * just verifies it is an address we can use.
172     *
173     * Since the kernel does everything in page size chunks ensure
174     * the destination addresses are page aligned. Too many
     * special cases crop up when we don't do this. The most
176     * insidious is getting overlapping destination addresses
177     * simply because addresses are changed to page size
178     * granularity.
179     */
180    result = -EADDRNOTAVAIL;
181    for (i = 0; i < nr_segments; i++) {
182        unsigned long mstart, mend;
183
184        mstart = image->segment[i].mem;
185        mend = mstart + image->segment[i].memsz;
186        if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
187            goto out;
188        if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
189            goto out;
190    }
191
    /* Verify our destination addresses do not overlap.
     * If we allowed overlapping destination addresses
     * through, very weird things can happen with no
     * easy explanation as one segment stomps on another.
     */
197    result = -EINVAL;
198    for (i = 0; i < nr_segments; i++) {
199        unsigned long mstart, mend;
200        unsigned long j;
201
202        mstart = image->segment[i].mem;
203        mend = mstart + image->segment[i].memsz;
204        for (j = 0; j < i; j++) {
205            unsigned long pstart, pend;
206            pstart = image->segment[j].mem;
207            pend = pstart + image->segment[j].memsz;
208            /* Do the segments overlap ? */
209            if ((mend > pstart) && (mstart < pend))
210                goto out;
211        }
212    }
213
    /* Ensure our buffer sizes do not exceed our memory
     * sizes. This should always be the case,
216     * and it is easier to check up front than to be surprised
217     * later on.
218     */
219    result = -EINVAL;
220    for (i = 0; i < nr_segments; i++) {
221        if (image->segment[i].bufsz > image->segment[i].memsz)
222            goto out;
223    }
224
225    result = 0;
226out:
227    if (result == 0)
228        *rimage = image;
229    else
230        kfree(image);
231
232    return result;
233
234}
235
236static void kimage_free_page_list(struct list_head *list);
237
238static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
239                unsigned long nr_segments,
240                struct kexec_segment __user *segments)
241{
242    int result;
243    struct kimage *image;
244
245    /* Allocate and initialize a controlling structure */
246    image = NULL;
247    result = do_kimage_alloc(&image, entry, nr_segments, segments);
248    if (result)
249        goto out;
250
251    /*
     * Find a location for the control code buffer, and add it to
     * the vector of segments so that its pages will also be
     * counted as destination pages.
255     */
256    result = -ENOMEM;
257    image->control_code_page = kimage_alloc_control_pages(image,
258                       get_order(KEXEC_CONTROL_PAGE_SIZE));
259    if (!image->control_code_page) {
260        printk(KERN_ERR "Could not allocate control_code_buffer\n");
261        goto out_free;
262    }
263
264    image->swap_page = kimage_alloc_control_pages(image, 0);
265    if (!image->swap_page) {
266        printk(KERN_ERR "Could not allocate swap buffer\n");
267        goto out_free;
268    }
269
270    *rimage = image;
271    return 0;
272
273out_free:
274    kimage_free_page_list(&image->control_pages);
275    kfree(image);
276out:
277    return result;
278}
279
280static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
281                unsigned long nr_segments,
282                struct kexec_segment __user *segments)
283{
284    int result;
285    struct kimage *image;
286    unsigned long i;
287
288    image = NULL;
289    /* Verify we have a valid entry point */
290    if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
291        result = -EADDRNOTAVAIL;
292        goto out;
293    }
294
295    /* Allocate and initialize a controlling structure */
296    result = do_kimage_alloc(&image, entry, nr_segments, segments);
297    if (result)
298        goto out;
299
300    /* Enable the special crash kernel control page
301     * allocation policy.
302     */
303    image->control_page = crashk_res.start;
304    image->type = KEXEC_TYPE_CRASH;
305
306    /*
307     * Verify we have good destination addresses. Normally
308     * the caller is responsible for making certain we don't
309     * attempt to load the new image into invalid or reserved
310     * areas of RAM. But crash kernels are preloaded into a
311     * reserved area of ram. We must ensure the addresses
312     * are in the reserved area otherwise preloading the
313     * kernel could corrupt things.
314     */
315    result = -EADDRNOTAVAIL;
316    for (i = 0; i < nr_segments; i++) {
317        unsigned long mstart, mend;
318
319        mstart = image->segment[i].mem;
320        mend = mstart + image->segment[i].memsz - 1;
321        /* Ensure we are within the crash kernel limits */
322        if ((mstart < crashk_res.start) || (mend > crashk_res.end))
323            goto out_free;
324    }
325
326    /*
     * Find a location for the control code buffer, and add it to
     * the vector of segments so that its pages will also be
     * counted as destination pages.
330     */
331    result = -ENOMEM;
332    image->control_code_page = kimage_alloc_control_pages(image,
333                       get_order(KEXEC_CONTROL_PAGE_SIZE));
334    if (!image->control_code_page) {
335        printk(KERN_ERR "Could not allocate control_code_buffer\n");
336        goto out_free;
337    }
338
339    *rimage = image;
340    return 0;
341
342out_free:
343    kfree(image);
344out:
345    return result;
346}
347
348static int kimage_is_destination_range(struct kimage *image,
349                    unsigned long start,
350                    unsigned long end)
351{
352    unsigned long i;
353
354    for (i = 0; i < image->nr_segments; i++) {
355        unsigned long mstart, mend;
356
357        mstart = image->segment[i].mem;
358        mend = mstart + image->segment[i].memsz;
359        if ((end > mstart) && (start < mend))
360            return 1;
361    }
362
363    return 0;
364}
365
366static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
367{
368    struct page *pages;
369
370    pages = alloc_pages(gfp_mask, order);
371    if (pages) {
372        unsigned int count, i;
373        pages->mapping = NULL;
374        set_page_private(pages, order);
375        count = 1 << order;
376        for (i = 0; i < count; i++)
377            SetPageReserved(pages + i);
378    }
379
380    return pages;
381}
382
383static void kimage_free_pages(struct page *page)
384{
385    unsigned int order, count, i;
386
387    order = page_private(page);
388    count = 1 << order;
389    for (i = 0; i < count; i++)
390        ClearPageReserved(page + i);
391    __free_pages(page, order);
392}
393
394static void kimage_free_page_list(struct list_head *list)
395{
396    struct list_head *pos, *next;
397
398    list_for_each_safe(pos, next, list) {
399        struct page *page;
400
401        page = list_entry(pos, struct page, lru);
402        list_del(&page->lru);
403        kimage_free_pages(page);
404    }
405}
406
407static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
408                            unsigned int order)
409{
410    /* Control pages are special, they are the intermediaries
411     * that are needed while we copy the rest of the pages
412     * to their final resting place. As such they must
413     * not conflict with either the destination addresses
414     * or memory the kernel is already using.
415     *
416     * The only case where we really need more than one of
417     * these are for architectures where we cannot disable
418     * the MMU and must instead generate an identity mapped
419     * page table for all of the memory.
420     *
421     * At worst this runs in O(N) of the image size.
422     */
423    struct list_head extra_pages;
424    struct page *pages;
425    unsigned int count;
426
427    count = 1 << order;
428    INIT_LIST_HEAD(&extra_pages);
429
430    /* Loop while I can allocate a page and the page allocated
431     * is a destination page.
432     */
433    do {
434        unsigned long pfn, epfn, addr, eaddr;
435
436        pages = kimage_alloc_pages(GFP_KERNEL, order);
437        if (!pages)
438            break;
439        pfn = page_to_pfn(pages);
440        epfn = pfn + count;
441        addr = pfn << PAGE_SHIFT;
442        eaddr = epfn << PAGE_SHIFT;
443        if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
444                  kimage_is_destination_range(image, addr, eaddr)) {
445            list_add(&pages->lru, &extra_pages);
446            pages = NULL;
447        }
448    } while (!pages);
449
450    if (pages) {
451        /* Remember the allocated page... */
452        list_add(&pages->lru, &image->control_pages);
453
        /* Because the page is already in its destination
455         * location we will never allocate another page at
456         * that address. Therefore kimage_alloc_pages
457         * will not return it (again) and we don't need
458         * to give it an entry in image->segment[].
459         */
460    }
461    /* Deal with the destination pages I have inadvertently allocated.
462     *
463     * Ideally I would convert multi-page allocations into single
464     * page allocations, and add everything to image->dest_pages.
465     *
466     * For now it is simpler to just free the pages.
467     */
468    kimage_free_page_list(&extra_pages);
469
470    return pages;
471}
472
473static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
474                              unsigned int order)
475{
476    /* Control pages are special, they are the intermediaries
477     * that are needed while we copy the rest of the pages
478     * to their final resting place. As such they must
479     * not conflict with either the destination addresses
480     * or memory the kernel is already using.
481     *
     * Control pages are also the only pages we must allocate
483     * when loading a crash kernel. All of the other pages
484     * are specified by the segments and we just memcpy
485     * into them directly.
486     *
487     * The only case where we really need more than one of
488     * these are for architectures where we cannot disable
489     * the MMU and must instead generate an identity mapped
490     * page table for all of the memory.
491     *
492     * Given the low demand this implements a very simple
493     * allocator that finds the first hole of the appropriate
494     * size in the reserved memory region, and allocates all
495     * of the memory up to and including the hole.
496     */
497    unsigned long hole_start, hole_end, size;
498    struct page *pages;
499
500    pages = NULL;
501    size = (1 << order) << PAGE_SHIFT;
502    hole_start = (image->control_page + (size - 1)) & ~(size - 1);
503    hole_end = hole_start + size - 1;
504    while (hole_end <= crashk_res.end) {
505        unsigned long i;
506
507        if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
508            break;
509        /* See if I overlap any of the segments */
510        for (i = 0; i < image->nr_segments; i++) {
511            unsigned long mstart, mend;
512
513            mstart = image->segment[i].mem;
514            mend = mstart + image->segment[i].memsz - 1;
515            if ((hole_end >= mstart) && (hole_start <= mend)) {
516                /* Advance the hole to the end of the segment */
517                hole_start = (mend + (size - 1)) & ~(size - 1);
518                hole_end = hole_start + size - 1;
519                break;
520            }
521        }
522        /* If I don't overlap any segments I have found my hole! */
523        if (i == image->nr_segments) {
524            pages = pfn_to_page(hole_start >> PAGE_SHIFT);
525            break;
526        }
527    }
528    if (pages)
529        image->control_page = hole_end;
530
531    return pages;
532}
533
534
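/*
 * Allocate control pages for @image, dispatching on the image type:
 * normal images take any pages that do not collide with a destination
 * range, crash images carve pages out of the reserved crashkernel
 * region.
 */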
535struct page *kimage_alloc_control_pages(struct kimage *image,
536                     unsigned int order)
537{
538    struct page *pages = NULL;
539
540    switch (image->type) {
541    case KEXEC_TYPE_DEFAULT:
542        pages = kimage_alloc_normal_control_pages(image, order);
543        break;
544    case KEXEC_TYPE_CRASH:
545        pages = kimage_alloc_crash_control_pages(image, order);
546        break;
547    }
548
549    return pages;
550}
551
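/*
 * Append one entry to the image's entry list, allocating and chaining
 * in a fresh indirection page (IND_INDIRECTION) whenever the current
 * page of entries fills up.  The list stays zero-terminated until
 * kimage_terminate() writes the final IND_DONE marker.
 */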
552static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
553{
554    if (*image->entry != 0)
555        image->entry++;
556
557    if (image->entry == image->last_entry) {
558        kimage_entry_t *ind_page;
559        struct page *page;
560
561        page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
562        if (!page)
563            return -ENOMEM;
564
565        ind_page = page_address(page);
566        *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
567        image->entry = ind_page;
568        image->last_entry = ind_page +
569                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
570    }
571    *image->entry = entry;
572    image->entry++;
573    *image->entry = 0;
574
575    return 0;
576}
577
578static int kimage_set_destination(struct kimage *image,
579                   unsigned long destination)
580{
581    int result;
582
583    destination &= PAGE_MASK;
584    result = kimage_add_entry(image, destination | IND_DESTINATION);
585    if (result == 0)
586        image->destination = destination;
587
588    return result;
589}
590
591
592static int kimage_add_page(struct kimage *image, unsigned long page)
593{
594    int result;
595
596    page &= PAGE_MASK;
597    result = kimage_add_entry(image, page | IND_SOURCE);
598    if (result == 0)
599        image->destination += PAGE_SIZE;
600
601    return result;
602}
603
604
605static void kimage_free_extra_pages(struct kimage *image)
606{
607    /* Walk through and free any extra destination pages I may have */
608    kimage_free_page_list(&image->dest_pages);
609
610    /* Walk through and free any unusable pages I have cached */
611    kimage_free_page_list(&image->unuseable_pages);
612
613}
614static void kimage_terminate(struct kimage *image)
615{
616    if (*image->entry != 0)
617        image->entry++;
618
619    *image->entry = IND_DONE;
620}
621
622#define for_each_kimage_entry(image, ptr, entry) \
623    for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
        ptr = (entry & IND_INDIRECTION) ? \
            phys_to_virt(entry & PAGE_MASK) : ptr + 1)
626
627static void kimage_free_entry(kimage_entry_t entry)
628{
629    struct page *page;
630
631    page = pfn_to_page(entry >> PAGE_SHIFT);
632    kimage_free_pages(page);
633}
634
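/*
 * Tear down an image: walk the entry list freeing every source and
 * indirection page, run the machine-specific cleanup hook, then
 * release the control pages and the kimage itself.
 */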
635static void kimage_free(struct kimage *image)
636{
637    kimage_entry_t *ptr, entry;
638    kimage_entry_t ind = 0;
639
640    if (!image)
641        return;
642
643    kimage_free_extra_pages(image);
644    for_each_kimage_entry(image, ptr, entry) {
645        if (entry & IND_INDIRECTION) {
646            /* Free the previous indirection page */
647            if (ind & IND_INDIRECTION)
648                kimage_free_entry(ind);
649            /* Save this indirection page until we are
650             * done with it.
651             */
652            ind = entry;
653        }
654        else if (entry & IND_SOURCE)
655            kimage_free_entry(entry);
656    }
657    /* Free the final indirection page */
658    if (ind & IND_INDIRECTION)
659        kimage_free_entry(ind);
660
661    /* Handle any machine specific cleanup */
662    machine_kexec_cleanup(image);
663
664    /* Free the kexec control pages... */
665    kimage_free_page_list(&image->control_pages);
666    kfree(image);
667}
668
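/*
 * Scan the entry list for the IND_SOURCE entry whose destination is
 * @page.  Returns a pointer to that entry, or NULL if no source page
 * has been assigned to that destination yet.
 */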
669static kimage_entry_t *kimage_dst_used(struct kimage *image,
670                    unsigned long page)
671{
672    kimage_entry_t *ptr, entry;
673    unsigned long destination = 0;
674
675    for_each_kimage_entry(image, ptr, entry) {
676        if (entry & IND_DESTINATION)
677            destination = entry & PAGE_MASK;
678        else if (entry & IND_SOURCE) {
679            if (page == destination)
680                return ptr;
681            destination += PAGE_SIZE;
682        }
683    }
684
685    return NULL;
686}
687
688static struct page *kimage_alloc_page(struct kimage *image,
689                    gfp_t gfp_mask,
690                    unsigned long destination)
691{
692    /*
693     * Here we implement safeguards to ensure that a source page
694     * is not copied to its destination page before the data on
695     * the destination page is no longer useful.
696     *
697     * To do this we maintain the invariant that a source page is
698     * either its own destination page, or it is not a
699     * destination page at all.
700     *
     * That is slightly stronger than required, but the proof
     * that no problems will occur is trivial, and the
     * implementation is simple to verify.
704     *
705     * When allocating all pages normally this algorithm will run
706     * in O(N) time, but in the worst case it will run in O(N^2)
707     * time. If the runtime is a problem the data structures can
708     * be fixed.
709     */
710    struct page *page;
711    unsigned long addr;
712
713    /*
714     * Walk through the list of destination pages, and see if I
715     * have a match.
716     */
717    list_for_each_entry(page, &image->dest_pages, lru) {
718        addr = page_to_pfn(page) << PAGE_SHIFT;
719        if (addr == destination) {
720            list_del(&page->lru);
721            return page;
722        }
723    }
724    page = NULL;
725    while (1) {
726        kimage_entry_t *old;
727
728        /* Allocate a page, if we run out of memory give up */
729        page = kimage_alloc_pages(gfp_mask, 0);
730        if (!page)
731            return NULL;
732        /* If the page cannot be used file it away */
733        if (page_to_pfn(page) >
734                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
735            list_add(&page->lru, &image->unuseable_pages);
736            continue;
737        }
738        addr = page_to_pfn(page) << PAGE_SHIFT;
739
        /* If it is the destination page we want, use it */
741        if (addr == destination)
742            break;
743
744        /* If the page is not a destination page use it */
745        if (!kimage_is_destination_range(image, addr,
746                          addr + PAGE_SIZE))
747            break;
748
749        /*
         * I know that the page is someone's destination page.
751         * See if there is already a source page for this
752         * destination page. And if so swap the source pages.
753         */
754        old = kimage_dst_used(image, addr);
755        if (old) {
756            /* If so move it */
757            unsigned long old_addr;
758            struct page *old_page;
759
760            old_addr = *old & PAGE_MASK;
761            old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
762            copy_highpage(page, old_page);
763            *old = addr | (*old & ~PAGE_MASK);
764
765            /* The old page I have found cannot be a
             * destination page, so return it if its
             * gfp_flags honor the ones passed in.
768             */
769            if (!(gfp_mask & __GFP_HIGHMEM) &&
770                PageHighMem(old_page)) {
771                kimage_free_pages(old_page);
772                continue;
773            }
774            addr = old_addr;
775            page = old_page;
776            break;
777        }
778        else {
            /* Place the page on the destination list; I
             * will use it later.
781             */
782            list_add(&page->lru, &image->dest_pages);
783        }
784    }
785
786    return page;
787}
788
789static int kimage_load_normal_segment(struct kimage *image,
790                     struct kexec_segment *segment)
791{
792    unsigned long maddr;
793    size_t ubytes, mbytes;
794    int result;
795    unsigned char __user *buf;
796
797    result = 0;
798    buf = segment->buf;
799    ubytes = segment->bufsz;
800    mbytes = segment->memsz;
801    maddr = segment->mem;
802
803    result = kimage_set_destination(image, maddr);
804    if (result < 0)
805        goto out;
806
807    while (mbytes) {
808        struct page *page;
809        char *ptr;
810        size_t uchunk, mchunk;
811
812        page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
813        if (!page) {
814            result = -ENOMEM;
815            goto out;
816        }
817        result = kimage_add_page(image, page_to_pfn(page)
818                                << PAGE_SHIFT);
819        if (result < 0)
820            goto out;
821
822        ptr = kmap(page);
823        /* Start with a clear page */
824        clear_page(ptr);
825        ptr += maddr & ~PAGE_MASK;
826        mchunk = min_t(size_t, mbytes,
827                PAGE_SIZE - (maddr & ~PAGE_MASK));
828        uchunk = min(ubytes, mchunk);
829
830        result = copy_from_user(ptr, buf, uchunk);
831        kunmap(page);
832        if (result) {
833            result = -EFAULT;
834            goto out;
835        }
836        ubytes -= uchunk;
837        maddr += mchunk;
838        buf += mchunk;
839        mbytes -= mchunk;
840    }
841out:
842    return result;
843}
844
845static int kimage_load_crash_segment(struct kimage *image,
846                    struct kexec_segment *segment)
847{
    /* For crash dump kernels we simply copy the data from
     * user space to its destination.
850     * We do things a page at a time for the sake of kmap.
851     */
852    unsigned long maddr;
853    size_t ubytes, mbytes;
854    int result;
855    unsigned char __user *buf;
856
857    result = 0;
858    buf = segment->buf;
859    ubytes = segment->bufsz;
860    mbytes = segment->memsz;
861    maddr = segment->mem;
862    while (mbytes) {
863        struct page *page;
864        char *ptr;
865        size_t uchunk, mchunk;
866
867        page = pfn_to_page(maddr >> PAGE_SHIFT);
868        if (!page) {
869            result = -ENOMEM;
870            goto out;
871        }
872        ptr = kmap(page);
873        ptr += maddr & ~PAGE_MASK;
874        mchunk = min_t(size_t, mbytes,
875                PAGE_SIZE - (maddr & ~PAGE_MASK));
876        uchunk = min(ubytes, mchunk);
877        if (mchunk > uchunk) {
878            /* Zero the trailing part of the page */
879            memset(ptr + uchunk, 0, mchunk - uchunk);
880        }
881        result = copy_from_user(ptr, buf, uchunk);
882        kexec_flush_icache_page(page);
883        kunmap(page);
884        if (result) {
885            result = -EFAULT;
886            goto out;
887        }
888        ubytes -= uchunk;
889        maddr += mchunk;
890        buf += mchunk;
891        mbytes -= mchunk;
892    }
893out:
894    return result;
895}
896
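/*
 * Copy one user-supplied segment into the image, dispatching on the
 * image type: normal segments go through freshly allocated source
 * pages, crash segments are written straight into the reserved region.
 */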
897static int kimage_load_segment(struct kimage *image,
898                struct kexec_segment *segment)
899{
900    int result = -ENOMEM;
901
902    switch (image->type) {
903    case KEXEC_TYPE_DEFAULT:
904        result = kimage_load_normal_segment(image, segment);
905        break;
906    case KEXEC_TYPE_CRASH:
907        result = kimage_load_crash_segment(image, segment);
908        break;
909    }
910
911    return result;
912}
913
914/*
915 * Exec Kernel system call: for obvious reasons only root may call it.
916 *
917 * This call breaks up into three pieces.
918 * - A generic part which loads the new kernel from the current
919 * address space, and very carefully places the data in the
920 * allocated pages.
921 *
922 * - A generic part that interacts with the kernel and tells all of
 * the devices to shut down, preventing on-going DMAs and placing
924 * the devices in a consistent state so a later kernel can
925 * reinitialize them.
926 *
927 * - A machine specific part that includes the syscall number
 * and then copies the image to its final destination and
 * jumps into the image at entry.
930 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do it yourself.
933 */
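/*
 * Illustrative user-space invocation (a sketch, not kernel
 * documentation; the destination address, sizes and flags below are
 * placeholders that a real loader such as kexec-tools computes per
 * architecture):
 *
 *	struct kexec_segment seg = {
 *		.buf = image_buf, .bufsz = image_len,
 *		.mem = (void *)0x100000, .memsz = image_memsz,
 *	};
 *	syscall(__NR_kexec_load, entry, 1, &seg, KEXEC_ARCH_DEFAULT);
 */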
934struct kimage *kexec_image;
935struct kimage *kexec_crash_image;
936int kexec_load_disabled;
937
938static DEFINE_MUTEX(kexec_mutex);
939
940SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
941        struct kexec_segment __user *, segments, unsigned long, flags)
942{
943    struct kimage **dest_image, *image;
944    int result;
945
946    /* We only trust the superuser with rebooting the system. */
947    if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
948        return -EPERM;
949
950    /*
951     * Verify we have a legal set of flags
952     * This leaves us room for future extensions.
953     */
954    if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
955        return -EINVAL;
956
957    /* Verify we are on the appropriate architecture */
958    if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
959        ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
960        return -EINVAL;
961
962    /* Put an artificial cap on the number
963     * of segments passed to kexec_load.
964     */
965    if (nr_segments > KEXEC_SEGMENT_MAX)
966        return -EINVAL;
967
968    image = NULL;
969    result = 0;
970
971    /* Because we write directly to the reserved memory
972     * region when loading crash kernels we need a mutex here to
973     * prevent multiple crash kernels from attempting to load
974     * simultaneously, and to prevent a crash kernel from loading
     * over the top of an in-use crash kernel.
976     *
977     * KISS: always take the mutex.
978     */
979    if (!mutex_trylock(&kexec_mutex))
980        return -EBUSY;
981
982    dest_image = &kexec_image;
983    if (flags & KEXEC_ON_CRASH)
984        dest_image = &kexec_crash_image;
985    if (nr_segments > 0) {
986        unsigned long i;
987
988        /* Loading another kernel to reboot into */
989        if ((flags & KEXEC_ON_CRASH) == 0)
990            result = kimage_normal_alloc(&image, entry,
991                            nr_segments, segments);
992        /* Loading another kernel to switch to if this one crashes */
993        else if (flags & KEXEC_ON_CRASH) {
994            /* Free any current crash dump kernel before
995             * we corrupt it.
996             */
997            kimage_free(xchg(&kexec_crash_image, NULL));
998            result = kimage_crash_alloc(&image, entry,
999                             nr_segments, segments);
1000            crash_map_reserved_pages();
1001        }
1002        if (result)
1003            goto out;
1004
1005        if (flags & KEXEC_PRESERVE_CONTEXT)
1006            image->preserve_context = 1;
1007        result = machine_kexec_prepare(image);
1008        if (result)
1009            goto out;
1010
1011        for (i = 0; i < nr_segments; i++) {
1012            result = kimage_load_segment(image, &image->segment[i]);
1013            if (result)
1014                goto out;
1015        }
1016        kimage_terminate(image);
1017        if (flags & KEXEC_ON_CRASH)
1018            crash_unmap_reserved_pages();
1019    }
    /* Install the new kernel and uninstall the old one */
1021    image = xchg(dest_image, image);
1022
1023out:
1024    mutex_unlock(&kexec_mutex);
1025    kimage_free(image);
1026
1027    return result;
1028}
1029
1030/*
1031 * Add and remove page tables for crashkernel memory
1032 *
1033 * Provide an empty default implementation here -- architecture
1034 * code may override this
1035 */
1036void __weak crash_map_reserved_pages(void)
1037{}
1038
1039void __weak crash_unmap_reserved_pages(void)
1040{}
1041
1042#ifdef CONFIG_COMPAT
1043COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1044               compat_ulong_t, nr_segments,
1045               struct compat_kexec_segment __user *, segments,
1046               compat_ulong_t, flags)
1047{
1048    struct compat_kexec_segment in;
1049    struct kexec_segment out, __user *ksegments;
1050    unsigned long i, result;
1051
1052    /* Don't allow clients that don't understand the native
1053     * architecture to do anything.
1054     */
1055    if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1056        return -EINVAL;
1057
1058    if (nr_segments > KEXEC_SEGMENT_MAX)
1059        return -EINVAL;
1060
1061    ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1062    for (i=0; i < nr_segments; i++) {
1063        result = copy_from_user(&in, &segments[i], sizeof(in));
1064        if (result)
1065            return -EFAULT;
1066
1067        out.buf = compat_ptr(in.buf);
1068        out.bufsz = in.bufsz;
1069        out.mem = in.mem;
1070        out.memsz = in.memsz;
1071
1072        result = copy_to_user(&ksegments[i], &out, sizeof(out));
1073        if (result)
1074            return -EFAULT;
1075    }
1076
1077    return sys_kexec_load(entry, nr_segments, ksegments, flags);
1078}
1079#endif
1080
1081void crash_kexec(struct pt_regs *regs)
1082{
1083    /* Take the kexec_mutex here to prevent sys_kexec_load
1084     * running on one cpu from replacing the crash kernel
1085     * we are using after a panic on a different cpu.
1086     *
1087     * If the crash kernel was not located in a fixed area
1088     * of memory the xchg(&kexec_crash_image) would be
1089     * sufficient. But since I reuse the memory...
1090     */
1091    if (mutex_trylock(&kexec_mutex)) {
1092        if (kexec_crash_image) {
1093            struct pt_regs fixed_regs;
1094
1095            crash_setup_regs(&fixed_regs, regs);
1096            crash_save_vmcoreinfo();
1097            machine_crash_shutdown(&fixed_regs);
1098            machine_kexec(kexec_crash_image);
1099        }
1100        mutex_unlock(&kexec_mutex);
1101    }
1102}
1103
1104size_t crash_get_memory_size(void)
1105{
1106    size_t size = 0;
1107    mutex_lock(&kexec_mutex);
1108    if (crashk_res.end != crashk_res.start)
1109        size = resource_size(&crashk_res);
1110    mutex_unlock(&kexec_mutex);
1111    return size;
1112}
1113
1114void __weak crash_free_reserved_phys_range(unsigned long begin,
1115                       unsigned long end)
1116{
1117    unsigned long addr;
1118
1119    for (addr = begin; addr < end; addr += PAGE_SIZE)
1120        free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1121}
1122
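/*
 * Shrink the crashkernel reservation to @new_size bytes, freeing the
 * released tail back to the page allocator and re-registering it as
 * "System RAM".  Fails with -ENOENT while a crash kernel is loaded and
 * with -EINVAL if the new size is larger than the current one.
 */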
1123int crash_shrink_memory(unsigned long new_size)
1124{
1125    int ret = 0;
1126    unsigned long start, end;
1127    unsigned long old_size;
1128    struct resource *ram_res;
1129
1130    mutex_lock(&kexec_mutex);
1131
1132    if (kexec_crash_image) {
1133        ret = -ENOENT;
1134        goto unlock;
1135    }
1136    start = crashk_res.start;
1137    end = crashk_res.end;
1138    old_size = (end == 0) ? 0 : end - start + 1;
1139    if (new_size >= old_size) {
1140        ret = (new_size == old_size) ? 0 : -EINVAL;
1141        goto unlock;
1142    }
1143
1144    ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1145    if (!ram_res) {
1146        ret = -ENOMEM;
1147        goto unlock;
1148    }
1149
1150    start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1151    end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1152
1153    crash_map_reserved_pages();
1154    crash_free_reserved_phys_range(end, crashk_res.end);
1155
1156    if ((start == end) && (crashk_res.parent != NULL))
1157        release_resource(&crashk_res);
1158
1159    ram_res->start = end;
1160    ram_res->end = crashk_res.end;
1161    ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1162    ram_res->name = "System RAM";
1163
1164    crashk_res.end = end - 1;
1165
1166    insert_resource(&iomem_resource, ram_res);
1167    crash_unmap_reserved_pages();
1168
1169unlock:
1170    mutex_unlock(&kexec_mutex);
1171    return ret;
1172}
1173
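/*
 * Emit one ELF note into @buf: the fixed note header followed by the
 * name and the descriptor data, each padded to a 4-byte boundary.
 * Returns a pointer just past the note so callers can append more.
 */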
1174static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1175                size_t data_len)
1176{
1177    struct elf_note note;
1178
1179    note.n_namesz = strlen(name) + 1;
1180    note.n_descsz = data_len;
1181    note.n_type = type;
1182    memcpy(buf, &note, sizeof(note));
1183    buf += (sizeof(note) + 3)/4;
1184    memcpy(buf, name, note.n_namesz);
1185    buf += (note.n_namesz + 3)/4;
1186    memcpy(buf, data, note.n_descsz);
1187    buf += (note.n_descsz + 3)/4;
1188
1189    return buf;
1190}
1191
1192static void final_note(u32 *buf)
1193{
1194    struct elf_note note;
1195
1196    note.n_namesz = 0;
1197    note.n_descsz = 0;
1198    note.n_type = 0;
1199    memcpy(buf, &note, sizeof(note));
1200}
1201
1202void crash_save_cpu(struct pt_regs *regs, int cpu)
1203{
1204    struct elf_prstatus prstatus;
1205    u32 *buf;
1206
1207    if ((cpu < 0) || (cpu >= nr_cpu_ids))
1208        return;
1209
1210    /* Using ELF notes here is opportunistic.
1211     * I need a well defined structure format
1212     * for the data I pass, and I need tags
1213     * on the data to indicate what information I have
1214     * squirrelled away. ELF notes happen to provide
1215     * all of that, so there is no need to invent something new.
1216     */
1217    buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1218    if (!buf)
1219        return;
1220    memset(&prstatus, 0, sizeof(prstatus));
1221    prstatus.pr_pid = current->pid;
1222    elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1223    buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1224                        &prstatus, sizeof(prstatus));
1225    final_note(buf);
1226}
1227
1228static int __init crash_notes_memory_init(void)
1229{
1230    /* Allocate memory for saving cpu registers. */
1231    crash_notes = alloc_percpu(note_buf_t);
1232    if (!crash_notes) {
1233        printk("Kexec: Memory allocation for saving cpu register"
1234        " states failed\n");
1235        return -ENOMEM;
1236    }
1237    return 0;
1238}
1239subsys_initcall(crash_notes_memory_init);
1240
1241
1242/*
1243 * parsing the "crashkernel" commandline
1244 *
1245 * this code is intended to be called from architecture specific code
1246 */
1247
1248
1249/*
1250 * This function parses command lines in the format
1251 *
1252 * crashkernel=ramsize-range:size[,...][@offset]
1253 *
1254 * The function returns 0 on success and -EINVAL on failure.
1255 */
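/*
 * Example (illustrative values only):
 *
 *	crashkernel=512M-2G:64M,2G-:128M@16M
 *
 * reserves 64M if system RAM is at least 512M but below 2G, 128M if it
 * is 2G or more, and places the reservation at physical address 16M.
 */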
1256static int __init parse_crashkernel_mem(char *cmdline,
1257                    unsigned long long system_ram,
1258                    unsigned long long *crash_size,
1259                    unsigned long long *crash_base)
1260{
1261    char *cur = cmdline, *tmp;
1262
1263    /* for each entry of the comma-separated list */
1264    do {
1265        unsigned long long start, end = ULLONG_MAX, size;
1266
1267        /* get the start of the range */
1268        start = memparse(cur, &tmp);
1269        if (cur == tmp) {
1270            pr_warning("crashkernel: Memory value expected\n");
1271            return -EINVAL;
1272        }
1273        cur = tmp;
1274        if (*cur != '-') {
1275            pr_warning("crashkernel: '-' expected\n");
1276            return -EINVAL;
1277        }
1278        cur++;
1279
        /* if no ':' is here, then we read the end */
1281        if (*cur != ':') {
1282            end = memparse(cur, &tmp);
1283            if (cur == tmp) {
1284                pr_warning("crashkernel: Memory "
1285                        "value expected\n");
1286                return -EINVAL;
1287            }
1288            cur = tmp;
1289            if (end <= start) {
1290                pr_warning("crashkernel: end <= start\n");
1291                return -EINVAL;
1292            }
1293        }
1294
1295        if (*cur != ':') {
1296            pr_warning("crashkernel: ':' expected\n");
1297            return -EINVAL;
1298        }
1299        cur++;
1300
1301        size = memparse(cur, &tmp);
1302        if (cur == tmp) {
1303            pr_warning("Memory value expected\n");
1304            return -EINVAL;
1305        }
1306        cur = tmp;
1307        if (size >= system_ram) {
1308            pr_warning("crashkernel: invalid size\n");
1309            return -EINVAL;
1310        }
1311
1312        /* match ? */
1313        if (system_ram >= start && system_ram < end) {
1314            *crash_size = size;
1315            break;
1316        }
1317    } while (*cur++ == ',');
1318
1319    if (*crash_size > 0) {
1320        while (*cur && *cur != ' ' && *cur != '@')
1321            cur++;
1322        if (*cur == '@') {
1323            cur++;
1324            *crash_base = memparse(cur, &tmp);
1325            if (cur == tmp) {
1326                pr_warning("Memory value expected "
1327                        "after '@'\n");
1328                return -EINVAL;
1329            }
1330        }
1331    }
1332
1333    return 0;
1334}
1335
1336/*
 * This function parses "simple" (old) crashkernel command lines like
1338 *
1339 * crashkernel=size[@offset]
1340 *
1341 * It returns 0 on success and -EINVAL on failure.
1342 */
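/*
 * Example (illustrative values only): "crashkernel=128M@16M" requests
 * a 128M reservation starting at physical address 16M.
 */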
1343static int __init parse_crashkernel_simple(char *cmdline,
1344                       unsigned long long *crash_size,
1345                       unsigned long long *crash_base)
1346{
1347    char *cur = cmdline;
1348
1349    *crash_size = memparse(cmdline, &cur);
1350    if (cmdline == cur) {
1351        pr_warning("crashkernel: memory value expected\n");
1352        return -EINVAL;
1353    }
1354
1355    if (*cur == '@')
1356        *crash_base = memparse(cur+1, &cur);
1357    else if (*cur != ' ' && *cur != '\0') {
1358        pr_warning("crashkernel: unrecognized char\n");
1359        return -EINVAL;
1360    }
1361
1362    return 0;
1363}
1364
1365#define SUFFIX_HIGH 0
1366#define SUFFIX_LOW 1
1367#define SUFFIX_NULL 2
1368static __initdata char *suffix_tbl[] = {
1369    [SUFFIX_HIGH] = ",high",
1370    [SUFFIX_LOW] = ",low",
1371    [SUFFIX_NULL] = NULL,
1372};
1373
1374/*
 * This function parses "suffix" crashkernel command lines like
1376 *
1377 * crashkernel=size,[high|low]
1378 *
1379 * It returns 0 on success and -EINVAL on failure.
1380 */
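/*
 * Example (illustrative values only): "crashkernel=256M,high" requests
 * a 256M reservation placed according to the architecture's ",high"
 * policy.
 */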
1381static int __init parse_crashkernel_suffix(char *cmdline,
1382                       unsigned long long *crash_size,
1383                       unsigned long long *crash_base,
1384                       const char *suffix)
1385{
1386    char *cur = cmdline;
1387
1388    *crash_size = memparse(cmdline, &cur);
1389    if (cmdline == cur) {
1390        pr_warn("crashkernel: memory value expected\n");
1391        return -EINVAL;
1392    }
1393
1394    /* check with suffix */
1395    if (strncmp(cur, suffix, strlen(suffix))) {
1396        pr_warn("crashkernel: unrecognized char\n");
1397        return -EINVAL;
1398    }
1399    cur += strlen(suffix);
1400    if (*cur != ' ' && *cur != '\0') {
1401        pr_warn("crashkernel: unrecognized char\n");
1402        return -EINVAL;
1403    }
1404
1405    return 0;
1406}
1407
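/*
 * Return a pointer to the last occurrence of @name on the command line
 * that matches @suffix (or, when @suffix is NULL, the last one carrying
 * no known suffix), so that later options override earlier ones.
 */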
1408static __init char *get_last_crashkernel(char *cmdline,
1409                 const char *name,
1410                 const char *suffix)
1411{
1412    char *p = cmdline, *ck_cmdline = NULL;
1413
1414    /* find crashkernel and use the last one if there are more */
1415    p = strstr(p, name);
1416    while (p) {
1417        char *end_p = strchr(p, ' ');
1418        char *q;
1419
1420        if (!end_p)
1421            end_p = p + strlen(p);
1422
1423        if (!suffix) {
1424            int i;
1425
1426            /* skip the one with any known suffix */
1427            for (i = 0; suffix_tbl[i]; i++) {
1428                q = end_p - strlen(suffix_tbl[i]);
1429                if (!strncmp(q, suffix_tbl[i],
1430                         strlen(suffix_tbl[i])))
1431                    goto next;
1432            }
1433            ck_cmdline = p;
1434        } else {
1435            q = end_p - strlen(suffix);
1436            if (!strncmp(q, suffix, strlen(suffix)))
1437                ck_cmdline = p;
1438        }
1439next:
1440        p = strstr(p+1, name);
1441    }
1442
1443    if (!ck_cmdline)
1444        return NULL;
1445
1446    return ck_cmdline;
1447}
1448
1449static int __init __parse_crashkernel(char *cmdline,
1450                 unsigned long long system_ram,
1451                 unsigned long long *crash_size,
1452                 unsigned long long *crash_base,
1453                 const char *name,
1454                 const char *suffix)
1455{
1456    char *first_colon, *first_space;
1457    char *ck_cmdline;
1458
1459    BUG_ON(!crash_size || !crash_base);
1460    *crash_size = 0;
1461    *crash_base = 0;
1462
1463    ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1464
1465    if (!ck_cmdline)
1466        return -EINVAL;
1467
1468    ck_cmdline += strlen(name);
1469
1470    if (suffix)
1471        return parse_crashkernel_suffix(ck_cmdline, crash_size,
1472                crash_base, suffix);
1473    /*
1474     * if the commandline contains a ':', then that's the extended
1475     * syntax -- if not, it must be the classic syntax
1476     */
1477    first_colon = strchr(ck_cmdline, ':');
1478    first_space = strchr(ck_cmdline, ' ');
1479    if (first_colon && (!first_space || first_colon < first_space))
1480        return parse_crashkernel_mem(ck_cmdline, system_ram,
1481                crash_size, crash_base);
1482
1483    return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1484}
1485
1486/*
 * This function is the entry point for command line parsing and should be
1488 * called from the arch-specific code.
1489 */
1490int __init parse_crashkernel(char *cmdline,
1491                 unsigned long long system_ram,
1492                 unsigned long long *crash_size,
1493                 unsigned long long *crash_base)
1494{
1495    return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1496                    "crashkernel=", NULL);
1497}
1498
1499int __init parse_crashkernel_high(char *cmdline,
1500                 unsigned long long system_ram,
1501                 unsigned long long *crash_size,
1502                 unsigned long long *crash_base)
1503{
1504    return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1505                "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1506}
1507
1508int __init parse_crashkernel_low(char *cmdline,
1509                 unsigned long long system_ram,
1510                 unsigned long long *crash_size,
1511                 unsigned long long *crash_base)
1512{
1513    return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1514                "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1515}
1516
1517static void update_vmcoreinfo_note(void)
1518{
1519    u32 *buf = vmcoreinfo_note;
1520
1521    if (!vmcoreinfo_size)
1522        return;
1523    buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1524                  vmcoreinfo_size);
1525    final_note(buf);
1526}
1527
1528void crash_save_vmcoreinfo(void)
1529{
1530    vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1531    update_vmcoreinfo_note();
1532}
1533
1534void vmcoreinfo_append_str(const char *fmt, ...)
1535{
1536    va_list args;
1537    char buf[0x50];
1538    size_t r;
1539
1540    va_start(args, fmt);
1541    r = vscnprintf(buf, sizeof(buf), fmt, args);
1542    va_end(args);
1543
1544    r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1545
1546    memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1547
1548    vmcoreinfo_size += r;
1549}
1550
1551/*
1552 * provide an empty default implementation here -- architecture
1553 * code may override this
1554 */
1555void __weak arch_crash_save_vmcoreinfo(void)
1556{}
1557
1558unsigned long __weak paddr_vmcoreinfo_note(void)
1559{
1560    return __pa((unsigned long)(char *)&vmcoreinfo_note);
1561}
1562
1563static int __init crash_save_vmcoreinfo_init(void)
1564{
1565    VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1566    VMCOREINFO_PAGESIZE(PAGE_SIZE);
1567
1568    VMCOREINFO_SYMBOL(init_uts_ns);
1569    VMCOREINFO_SYMBOL(node_online_map);
1570#ifdef CONFIG_MMU
1571    VMCOREINFO_SYMBOL(swapper_pg_dir);
1572#endif
1573    VMCOREINFO_SYMBOL(_stext);
1574    VMCOREINFO_SYMBOL(vmap_area_list);
1575
1576#ifndef CONFIG_NEED_MULTIPLE_NODES
1577    VMCOREINFO_SYMBOL(mem_map);
1578    VMCOREINFO_SYMBOL(contig_page_data);
1579#endif
1580#ifdef CONFIG_SPARSEMEM
1581    VMCOREINFO_SYMBOL(mem_section);
1582    VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1583    VMCOREINFO_STRUCT_SIZE(mem_section);
1584    VMCOREINFO_OFFSET(mem_section, section_mem_map);
1585#endif
1586    VMCOREINFO_STRUCT_SIZE(page);
1587    VMCOREINFO_STRUCT_SIZE(pglist_data);
1588    VMCOREINFO_STRUCT_SIZE(zone);
1589    VMCOREINFO_STRUCT_SIZE(free_area);
1590    VMCOREINFO_STRUCT_SIZE(list_head);
1591    VMCOREINFO_SIZE(nodemask_t);
1592    VMCOREINFO_OFFSET(page, flags);
1593    VMCOREINFO_OFFSET(page, _count);
1594    VMCOREINFO_OFFSET(page, mapping);
1595    VMCOREINFO_OFFSET(page, lru);
1596    VMCOREINFO_OFFSET(page, _mapcount);
1597    VMCOREINFO_OFFSET(page, private);
1598    VMCOREINFO_OFFSET(pglist_data, node_zones);
1599    VMCOREINFO_OFFSET(pglist_data, nr_zones);
1600#ifdef CONFIG_FLAT_NODE_MEM_MAP
1601    VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1602#endif
1603    VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1604    VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1605    VMCOREINFO_OFFSET(pglist_data, node_id);
1606    VMCOREINFO_OFFSET(zone, free_area);
1607    VMCOREINFO_OFFSET(zone, vm_stat);
1608    VMCOREINFO_OFFSET(zone, spanned_pages);
1609    VMCOREINFO_OFFSET(free_area, free_list);
1610    VMCOREINFO_OFFSET(list_head, next);
1611    VMCOREINFO_OFFSET(list_head, prev);
1612    VMCOREINFO_OFFSET(vmap_area, va_start);
1613    VMCOREINFO_OFFSET(vmap_area, list);
1614    VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1615    log_buf_kexec_setup();
1616    VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1617    VMCOREINFO_NUMBER(NR_FREE_PAGES);
1618    VMCOREINFO_NUMBER(PG_lru);
1619    VMCOREINFO_NUMBER(PG_private);
1620    VMCOREINFO_NUMBER(PG_swapcache);
1621    VMCOREINFO_NUMBER(PG_slab);
1622#ifdef CONFIG_MEMORY_FAILURE
1623    VMCOREINFO_NUMBER(PG_hwpoison);
1624#endif
1625    VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1626
1627    arch_crash_save_vmcoreinfo();
1628    update_vmcoreinfo_note();
1629
1630    return 0;
1631}
1632
1633subsys_initcall(crash_save_vmcoreinfo_init);
1634
1635/*
1636 * Move into place and start executing a preloaded standalone
1637 * executable. If nothing was preloaded return an error.
1638 */
1639int kernel_kexec(void)
1640{
1641    int error = 0;
1642
1643    if (!mutex_trylock(&kexec_mutex))
1644        return -EBUSY;
1645    if (!kexec_image) {
1646        error = -EINVAL;
1647        goto Unlock;
1648    }
1649
1650#ifdef CONFIG_KEXEC_JUMP
1651    if (kexec_image->preserve_context) {
1652        lock_system_sleep();
1653        pm_prepare_console();
1654        error = freeze_processes();
1655        if (error) {
1656            error = -EBUSY;
1657            goto Restore_console;
1658        }
1659        suspend_console();
1660        error = dpm_suspend_start(PMSG_FREEZE);
1661        if (error)
1662            goto Resume_console;
1663        /* At this point, dpm_suspend_start() has been called,
1664         * but *not* dpm_suspend_end(). We *must* call
1665         * dpm_suspend_end() now. Otherwise, drivers for
1666         * some devices (e.g. interrupt controllers) become
1667         * desynchronized with the actual state of the
1668         * hardware at resume time, and evil weirdness ensues.
1669         */
1670        error = dpm_suspend_end(PMSG_FREEZE);
1671        if (error)
1672            goto Resume_devices;
1673        error = disable_nonboot_cpus();
1674        if (error)
1675            goto Enable_cpus;
1676        local_irq_disable();
1677        error = syscore_suspend();
1678        if (error)
1679            goto Enable_irqs;
1680    } else
1681#endif
1682    {
1683        kexec_in_progress = true;
1684        kernel_restart_prepare(NULL);
1685        migrate_to_reboot_cpu();
1686
1687        /*
1688         * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1689         * no further code needs to use CPU hotplug (which is true in
1690         * the reboot case). However, the kexec path depends on using
1691         * CPU hotplug again; so re-enable it here.
1692         */
1693        cpu_hotplug_enable();
1694        printk(KERN_EMERG "Starting new kernel\n");
1695        machine_shutdown();
1696    }
1697
1698    machine_kexec(kexec_image);
1699
1700#ifdef CONFIG_KEXEC_JUMP
1701    if (kexec_image->preserve_context) {
1702        syscore_resume();
1703 Enable_irqs:
1704        local_irq_enable();
1705 Enable_cpus:
1706        enable_nonboot_cpus();
1707        dpm_resume_start(PMSG_RESTORE);
1708 Resume_devices:
1709        dpm_resume_end(PMSG_RESTORE);
1710 Resume_console:
1711        resume_console();
1712        thaw_processes();
1713 Restore_console:
1714        pm_restore_console();
1715        unlock_system_sleep();
1716    }
1717#endif
1718
1719 Unlock:
1720    mutex_unlock(&kexec_mutex);
1721    return error;
1722}
1723
