1/*
2 * linux/fs/exec.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7/*
8 * #!-checking implemented by tytso.
9 */
10/*
11 * Demand-loading implemented 01.12.91 - no need to read anything but
12 * the header into memory. The inode of the executable is put into
13 * "current->executable", and page faults do the actual loading. Clean.
14 *
15 * Once more I can proudly say that linux stood up to being changed: it
16 * was less than 2 hours work to get demand-loading completely implemented.
17 *
18 * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
19 * current->executable is only used by the procfs. This allows a dispatch
20 * table to check for several different types of binary formats. We keep
21 * trying until we recognize the file or we run out of supported binary
22 * formats.
23 */
24
25#include <linux/slab.h>
26#include <linux/file.h>
27#include <linux/fdtable.h>
28#include <linux/mm.h>
29#include <linux/stat.h>
30#include <linux/fcntl.h>
31#include <linux/smp_lock.h>
32#include <linux/swap.h>
33#include <linux/string.h>
34#include <linux/init.h>
35#include <linux/pagemap.h>
36#include <linux/perf_event.h>
37#include <linux/highmem.h>
38#include <linux/spinlock.h>
39#include <linux/key.h>
40#include <linux/personality.h>
41#include <linux/binfmts.h>
42#include <linux/utsname.h>
43#include <linux/pid_namespace.h>
44#include <linux/module.h>
45#include <linux/namei.h>
46#include <linux/proc_fs.h>
47#include <linux/mount.h>
48#include <linux/security.h>
49#include <linux/syscalls.h>
50#include <linux/tsacct_kern.h>
51#include <linux/cn_proc.h>
52#include <linux/audit.h>
53#include <linux/tracehook.h>
54#include <linux/kmod.h>
55#include <linux/fsnotify.h>
56#include <linux/fs_struct.h>
57#include <linux/pipe_fs_i.h>
58
59#include <asm/uaccess.h>
60#include <asm/mmu_context.h>
61#include <asm/tlb.h>
62#include "internal.h"
63
64int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core";
66unsigned int core_pipe_limit;
67int suid_dumpable = 0;
68
69/* The maximal length of core_pattern is also specified in sysctl.c */
70
71static LIST_HEAD(formats);
72static DEFINE_RWLOCK(binfmt_lock);
73
74int __register_binfmt(struct linux_binfmt * fmt, int insert)
75{
76    if (!fmt)
77        return -EINVAL;
78    write_lock(&binfmt_lock);
79    insert ? list_add(&fmt->lh, &formats) :
80         list_add_tail(&fmt->lh, &formats);
81    write_unlock(&binfmt_lock);
82    return 0;
83}
84
85EXPORT_SYMBOL(__register_binfmt);
86
87void unregister_binfmt(struct linux_binfmt * fmt)
88{
89    write_lock(&binfmt_lock);
90    list_del(&fmt->lh);
91    write_unlock(&binfmt_lock);
92}
93
94EXPORT_SYMBOL(unregister_binfmt);
95
96static inline void put_binfmt(struct linux_binfmt * fmt)
97{
98    module_put(fmt->module);
99}
100
101/*
102 * Note that a shared library must be both readable and executable, for
103 * security reasons.
104 *
105 * Also note that the load address is taken from the file itself.
106 */
107SYSCALL_DEFINE1(uselib, const char __user *, library)
108{
109    struct file *file;
110    char *tmp = getname(library);
111    int error = PTR_ERR(tmp);
112
113    if (IS_ERR(tmp))
114        goto out;
115
116    file = do_filp_open(AT_FDCWD, tmp,
117                O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
118                MAY_READ | MAY_EXEC | MAY_OPEN);
119    putname(tmp);
120    error = PTR_ERR(file);
121    if (IS_ERR(file))
122        goto out;
123
124    error = -EINVAL;
125    if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
126        goto exit;
127
128    error = -EACCES;
129    if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
130        goto exit;
131
132    fsnotify_open(file->f_path.dentry);
133
134    error = -ENOEXEC;
135    if (file->f_op) {
136        struct linux_binfmt * fmt;
137
138        read_lock(&binfmt_lock);
139        list_for_each_entry(fmt, &formats, lh) {
140            if (!fmt->load_shlib)
141                continue;
142            if (!try_module_get(fmt->module))
143                continue;
144            read_unlock(&binfmt_lock);
145            error = fmt->load_shlib(file);
146            read_lock(&binfmt_lock);
147            put_binfmt(fmt);
148            if (error != -ENOEXEC)
149                break;
150        }
151        read_unlock(&binfmt_lock);
152    }
153exit:
154    fput(file);
155out:
156    return error;
157}
158
159#ifdef CONFIG_MMU
160
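/*
 * Pin the page that backs the argument area at 'pos', faulting it in via
 * get_user_pages(). For writes, also check that the argv/env strings stay
 * within ARG_MAX or, beyond that, within a quarter of RLIMIT_STACK.
 */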
161static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
162        int write)
163{
164    struct page *page;
165    int ret;
166
167#ifdef CONFIG_STACK_GROWSUP
168    if (write) {
169        ret = expand_stack_downwards(bprm->vma, pos);
170        if (ret < 0)
171            return NULL;
172    }
173#endif
174    ret = get_user_pages(current, bprm->mm, pos,
175            1, write, 1, &page, NULL);
176    if (ret <= 0)
177        return NULL;
178
179    if (write) {
180        unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
181        struct rlimit *rlim;
182
183        /*
184         * We've historically supported up to 32 pages (ARG_MAX)
185         * of argument strings even with small stacks
186         */
187        if (size <= ARG_MAX)
188            return page;
189
190        /*
191         * Limit to 1/4-th the stack size for the argv+env strings.
192         * This ensures that:
193         * - the remaining binfmt code will not run out of stack space,
194         * - the program will have a reasonable amount of stack left
195         * to work from.
196         */
197        rlim = current->signal->rlim;
198        if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
199            put_page(page);
200            return NULL;
201        }
202    }
203
204    return page;
205}
206
207static void put_arg_page(struct page *page)
208{
209    put_page(page);
210}
211
212static void free_arg_page(struct linux_binprm *bprm, int i)
213{
214}
215
216static void free_arg_pages(struct linux_binprm *bprm)
217{
218}
219
220static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
221        struct page *page)
222{
223    flush_cache_page(bprm->vma, pos, page_to_pfn(page));
224}
225
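/*
 * Set up the temporary argument stack: a single anonymous page mapped at
 * the highest stack address the architecture supports (STACK_TOP_MAX).
 * bprm->p is initialised to point just below the top of that page.
 */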
226static int __bprm_mm_init(struct linux_binprm *bprm)
227{
228    int err;
229    struct vm_area_struct *vma = NULL;
230    struct mm_struct *mm = bprm->mm;
231
232    bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
233    if (!vma)
234        return -ENOMEM;
235
236    down_write(&mm->mmap_sem);
237    vma->vm_mm = mm;
238
239    /*
240     * Place the stack at the largest stack address the architecture
241     * supports. Later, we'll move this to an appropriate place. We don't
242     * use STACK_TOP because that can depend on attributes which aren't
243     * configured yet.
244     */
245    vma->vm_end = STACK_TOP_MAX;
246    vma->vm_start = vma->vm_end - PAGE_SIZE;
247    vma->vm_flags = VM_STACK_FLAGS;
248    vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249    INIT_LIST_HEAD(&vma->anon_vma_chain);
250    err = insert_vm_struct(mm, vma);
251    if (err)
252        goto err;
253
254    mm->stack_vm = mm->total_vm = 1;
255    up_write(&mm->mmap_sem);
256    bprm->p = vma->vm_end - sizeof(void *);
257    return 0;
258err:
259    up_write(&mm->mmap_sem);
260    bprm->vma = NULL;
261    kmem_cache_free(vm_area_cachep, vma);
262    return err;
263}
264
265static bool valid_arg_len(struct linux_binprm *bprm, long len)
266{
267    return len <= MAX_ARG_STRLEN;
268}
269
270#else
271
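/*
 * Without an MMU the argument pages live in the fixed bprm->page[] array
 * of MAX_ARG_PAGES entries; pages are allocated lazily on first write.
 */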
272static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
273        int write)
274{
275    struct page *page;
276
277    page = bprm->page[pos / PAGE_SIZE];
278    if (!page && write) {
279        page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
280        if (!page)
281            return NULL;
282        bprm->page[pos / PAGE_SIZE] = page;
283    }
284
285    return page;
286}
287
288static void put_arg_page(struct page *page)
289{
290}
291
292static void free_arg_page(struct linux_binprm *bprm, int i)
293{
294    if (bprm->page[i]) {
295        __free_page(bprm->page[i]);
296        bprm->page[i] = NULL;
297    }
298}
299
300static void free_arg_pages(struct linux_binprm *bprm)
301{
302    int i;
303
304    for (i = 0; i < MAX_ARG_PAGES; i++)
305        free_arg_page(bprm, i);
306}
307
308static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
309        struct page *page)
310{
311}
312
313static int __bprm_mm_init(struct linux_binprm *bprm)
314{
315    bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
316    return 0;
317}
318
319static bool valid_arg_len(struct linux_binprm *bprm, long len)
320{
321    return len <= bprm->p;
322}
323
324#endif /* CONFIG_MMU */
325
326/*
327 * Create a new mm_struct and populate it with a temporary stack
328 * vm_area_struct. We don't have enough context at this point to set the stack
329 * flags, permissions, and offset, so we use temporary values. We'll update
330 * them later in setup_arg_pages().
331 */
332int bprm_mm_init(struct linux_binprm *bprm)
333{
334    int err;
335    struct mm_struct *mm = NULL;
336
337    bprm->mm = mm = mm_alloc();
338    err = -ENOMEM;
339    if (!mm)
340        goto err;
341
342    err = init_new_context(current, mm);
343    if (err)
344        goto err;
345
346    err = __bprm_mm_init(bprm);
347    if (err)
348        goto err;
349
350    return 0;
351
352err:
353    if (mm) {
354        bprm->mm = NULL;
355        mmdrop(mm);
356    }
357
358    return err;
359}
360
361/*
362 * count() counts the number of strings in array ARGV.
363 */
364static int count(char __user * __user * argv, int max)
365{
366    int i = 0;
367
368    if (argv != NULL) {
369        for (;;) {
370            char __user * p;
371
372            if (get_user(p, argv))
373                return -EFAULT;
374            if (!p)
375                break;
376            argv++;
377            if (i++ >= max)
378                return -E2BIG;
379            cond_resched();
380        }
381    }
382    return i;
383}
384
385/*
386 * 'copy_strings()' copies argument/environment strings from the old
387 * process's memory to the new process's stack. The call to get_user_pages()
388 * ensures the destination page is created and not swapped out.
389 */
390static int copy_strings(int argc, char __user * __user * argv,
391            struct linux_binprm *bprm)
392{
393    struct page *kmapped_page = NULL;
394    char *kaddr = NULL;
395    unsigned long kpos = 0;
396    int ret;
397
398    while (argc-- > 0) {
399        char __user *str;
400        int len;
401        unsigned long pos;
402
403        if (get_user(str, argv+argc) ||
404                !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
405            ret = -EFAULT;
406            goto out;
407        }
408
409        if (!valid_arg_len(bprm, len)) {
410            ret = -E2BIG;
411            goto out;
412        }
413
414        /* We're going to work our way backwards. */
415        pos = bprm->p;
416        str += len;
417        bprm->p -= len;
418
419        while (len > 0) {
420            int offset, bytes_to_copy;
421
422            offset = pos % PAGE_SIZE;
423            if (offset == 0)
424                offset = PAGE_SIZE;
425
426            bytes_to_copy = offset;
427            if (bytes_to_copy > len)
428                bytes_to_copy = len;
429
430            offset -= bytes_to_copy;
431            pos -= bytes_to_copy;
432            str -= bytes_to_copy;
433            len -= bytes_to_copy;
434
435            if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
436                struct page *page;
437
438                page = get_arg_page(bprm, pos, 1);
439                if (!page) {
440                    ret = -E2BIG;
441                    goto out;
442                }
443
444                if (kmapped_page) {
445                    flush_kernel_dcache_page(kmapped_page);
446                    kunmap(kmapped_page);
447                    put_arg_page(kmapped_page);
448                }
449                kmapped_page = page;
450                kaddr = kmap(kmapped_page);
451                kpos = pos & PAGE_MASK;
452                flush_arg_page(bprm, kpos, kmapped_page);
453            }
454            if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
455                ret = -EFAULT;
456                goto out;
457            }
458        }
459    }
460    ret = 0;
461out:
462    if (kmapped_page) {
463        flush_kernel_dcache_page(kmapped_page);
464        kunmap(kmapped_page);
465        put_arg_page(kmapped_page);
466    }
467    return ret;
468}
469
470/*
471 * Like copy_strings, but get argv and its values from kernel memory.
472 */
473int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
474{
475    int r;
476    mm_segment_t oldfs = get_fs();
477    set_fs(KERNEL_DS);
478    r = copy_strings(argc, (char __user * __user *)argv, bprm);
479    set_fs(oldfs);
480    return r;
481}
482EXPORT_SYMBOL(copy_strings_kernel);
483
484#ifdef CONFIG_MMU
485
486/*
487 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
488 * the binfmt code determines where the new stack should reside, we shift it to
489 * its final location. The process proceeds as follows:
490 *
491 * 1) Use shift to calculate the new vma endpoints.
492 * 2) Extend vma to cover both the old and new ranges. This ensures the
493 * arguments passed to subsequent functions are consistent.
494 * 3) Move vma's page tables to the new range.
495 * 4) Free up any cleared pgd range.
496 * 5) Shrink the vma to cover only the new range.
497 */
498static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
499{
500    struct mm_struct *mm = vma->vm_mm;
501    unsigned long old_start = vma->vm_start;
502    unsigned long old_end = vma->vm_end;
503    unsigned long length = old_end - old_start;
504    unsigned long new_start = old_start - shift;
505    unsigned long new_end = old_end - shift;
506    struct mmu_gather *tlb;
507
508    BUG_ON(new_start > new_end);
509
510    /*
511     * ensure there are no vmas between where we want to go
512     * and where we are
513     */
514    if (vma != find_vma(mm, new_start))
515        return -EFAULT;
516
517    /*
518     * cover the whole range: [new_start, old_end)
519     */
520    if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
521        return -ENOMEM;
522
523    /*
524     * move the page tables downwards, on failure we rely on
525     * process cleanup to remove whatever mess we made.
526     */
527    if (length != move_page_tables(vma, old_start,
528                       vma, new_start, length))
529        return -ENOMEM;
530
531    lru_add_drain();
532    tlb = tlb_gather_mmu(mm, 0);
533    if (new_end > old_start) {
534        /*
535         * When the old and new regions overlap, clear from new_end.
536         */
537        free_pgd_range(tlb, new_end, old_end, new_end,
538            vma->vm_next ? vma->vm_next->vm_start : 0);
539    } else {
540        /*
541         * otherwise, clean from old_start; this is done to avoid touching
542         * the address space in [new_end, old_start). Some architectures
543         * have constraints on the va-space that make this illegal (IA64);
544         * for the others it is just a little faster.
545         */
546        free_pgd_range(tlb, old_start, old_end, new_end,
547            vma->vm_next ? vma->vm_next->vm_start : 0);
548    }
549    tlb_finish_mmu(tlb, new_end, old_end);
550
551    /*
552     * Shrink the vma to just the new range. Always succeeds.
553     */
554    vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
555
556    return 0;
557}
558
559/*
560 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
561 * the stack is optionally relocated, and some extra space is added.
562 */
563int setup_arg_pages(struct linux_binprm *bprm,
564            unsigned long stack_top,
565            int executable_stack)
566{
567    unsigned long ret;
568    unsigned long stack_shift;
569    struct mm_struct *mm = current->mm;
570    struct vm_area_struct *vma = bprm->vma;
571    struct vm_area_struct *prev = NULL;
572    unsigned long vm_flags;
573    unsigned long stack_base;
574    unsigned long stack_size;
575    unsigned long stack_expand;
576    unsigned long rlim_stack;
577
578#ifdef CONFIG_STACK_GROWSUP
579    /* Limit stack size to 1GB */
580    stack_base = rlimit_max(RLIMIT_STACK);
581    if (stack_base > (1 << 30))
582        stack_base = 1 << 30;
583
584    /* Make sure we didn't let the argument array grow too large. */
585    if (vma->vm_end - vma->vm_start > stack_base)
586        return -ENOMEM;
587
588    stack_base = PAGE_ALIGN(stack_top - stack_base);
589
590    stack_shift = vma->vm_start - stack_base;
591    mm->arg_start = bprm->p - stack_shift;
592    bprm->p = vma->vm_end - stack_shift;
593#else
594    stack_top = arch_align_stack(stack_top);
595    stack_top = PAGE_ALIGN(stack_top);
596    stack_shift = vma->vm_end - stack_top;
597
598    bprm->p -= stack_shift;
599    mm->arg_start = bprm->p;
600#endif
601
602    if (bprm->loader)
603        bprm->loader -= stack_shift;
604    bprm->exec -= stack_shift;
605
606    down_write(&mm->mmap_sem);
607    vm_flags = VM_STACK_FLAGS;
608
609    /*
610     * Adjust stack execute permissions; explicitly enable for
611     * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
612     * (arch default) otherwise.
613     */
614    if (unlikely(executable_stack == EXSTACK_ENABLE_X))
615        vm_flags |= VM_EXEC;
616    else if (executable_stack == EXSTACK_DISABLE_X)
617        vm_flags &= ~VM_EXEC;
618    vm_flags |= mm->def_flags;
619
620    ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621            vm_flags);
622    if (ret)
623        goto out_unlock;
624    BUG_ON(prev != vma);
625
626    /* Move stack pages down in memory. */
627    if (stack_shift) {
628        ret = shift_arg_pages(vma, stack_shift);
629        if (ret)
630            goto out_unlock;
631    }
632
633    stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634    stack_size = vma->vm_end - vma->vm_start;
635    /*
636     * Align this down to a page boundary as expand_stack
637     * will align it up.
638     */
639    rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
640#ifdef CONFIG_STACK_GROWSUP
641    if (stack_size + stack_expand > rlim_stack)
642        stack_base = vma->vm_start + rlim_stack;
643    else
644        stack_base = vma->vm_end + stack_expand;
645#else
646    if (stack_size + stack_expand > rlim_stack)
647        stack_base = vma->vm_end - rlim_stack;
648    else
649        stack_base = vma->vm_start - stack_expand;
650#endif
651    ret = expand_stack(vma, stack_base);
652    if (ret)
653        ret = -EFAULT;
654
655out_unlock:
656    up_write(&mm->mmap_sem);
657    return ret;
658}
659EXPORT_SYMBOL(setup_arg_pages);
660
661#endif /* CONFIG_MMU */
662
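/*
 * Open a candidate executable. The file must be a regular file on a mount
 * that allows exec; write access to it is denied while it is open for
 * execution.
 */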
663struct file *open_exec(const char *name)
664{
665    struct file *file;
666    int err;
667
668    file = do_filp_open(AT_FDCWD, name,
669                O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
670                MAY_EXEC | MAY_OPEN);
671    if (IS_ERR(file))
672        goto out;
673
674    err = -EACCES;
675    if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
676        goto exit;
677
678    if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
679        goto exit;
680
681    fsnotify_open(file->f_path.dentry);
682
683    err = deny_write_access(file);
684    if (err)
685        goto exit;
686
687out:
688    return file;
689
690exit:
691    fput(file);
692    return ERR_PTR(err);
693}
694EXPORT_SYMBOL(open_exec);
695
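/*
 * Read 'count' bytes from 'file' at 'offset' into a kernel buffer by
 * temporarily lifting the user address limit around vfs_read().
 */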
696int kernel_read(struct file *file, loff_t offset,
697        char *addr, unsigned long count)
698{
699    mm_segment_t old_fs;
700    loff_t pos = offset;
701    int result;
702
703    old_fs = get_fs();
704    set_fs(get_ds());
705    /* The cast to a user pointer is valid due to the set_fs() */
706    result = vfs_read(file, (void __user *)addr, count, &pos);
707    set_fs(old_fs);
708    return result;
709}
710
711EXPORT_SYMBOL(kernel_read);
712
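/*
 * Install the new mm as current->mm. If the old mm has a core dump in
 * progress we back out with -EINTR instead of completing the exec; on
 * success the reference to the old mm is dropped.
 */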
713static int exec_mmap(struct mm_struct *mm)
714{
715    struct task_struct *tsk;
716    struct mm_struct * old_mm, *active_mm;
717
718    /* Notify parent that we're no longer interested in the old VM */
719    tsk = current;
720    old_mm = current->mm;
721    sync_mm_rss(tsk, old_mm);
722    mm_release(tsk, old_mm);
723
724    if (old_mm) {
725        /*
726         * Make sure that if there is a core dump in progress
727         * for the old mm, we get out and die instead of going
728         * through with the exec. We must hold mmap_sem around
729         * checking core_state and changing tsk->mm.
730         */
731        down_read(&old_mm->mmap_sem);
732        if (unlikely(old_mm->core_state)) {
733            up_read(&old_mm->mmap_sem);
734            return -EINTR;
735        }
736    }
737    task_lock(tsk);
738    active_mm = tsk->active_mm;
739    tsk->mm = mm;
740    tsk->active_mm = mm;
741    activate_mm(active_mm, mm);
742    task_unlock(tsk);
743    arch_pick_mmap_layout(mm);
744    if (old_mm) {
745        up_read(&old_mm->mmap_sem);
746        BUG_ON(active_mm != old_mm);
747        mm_update_next_owner(old_mm);
748        mmput(old_mm);
749        return 0;
750    }
751    mmdrop(active_mm);
752    return 0;
753}
754
755/*
756 * This function makes sure the current process has its own signal table,
757 * so that flush_signal_handlers can later reset the handlers without
758 * disturbing other processes. (Other processes might share the signal
759 * table via the CLONE_SIGHAND option to clone().)
760 */
761static int de_thread(struct task_struct *tsk)
762{
763    struct signal_struct *sig = tsk->signal;
764    struct sighand_struct *oldsighand = tsk->sighand;
765    spinlock_t *lock = &oldsighand->siglock;
766    int count;
767
768    if (thread_group_empty(tsk))
769        goto no_thread_group;
770
771    /*
772     * Kill all other threads in the thread group.
773     */
774    spin_lock_irq(lock);
775    if (signal_group_exit(sig)) {
776        /*
777         * Another group action in progress, just
778         * return so that the signal is processed.
779         */
780        spin_unlock_irq(lock);
781        return -EAGAIN;
782    }
783    sig->group_exit_task = tsk;
784    zap_other_threads(tsk);
785
786    /* Account for the thread group leader hanging around: */
787    count = thread_group_leader(tsk) ? 1 : 2;
788    sig->notify_count = count;
789    while (atomic_read(&sig->count) > count) {
790        __set_current_state(TASK_UNINTERRUPTIBLE);
791        spin_unlock_irq(lock);
792        schedule();
793        spin_lock_irq(lock);
794    }
795    spin_unlock_irq(lock);
796
797    /*
798     * At this point all other threads have exited, all we have to
799     * do is to wait for the thread group leader to become inactive,
800     * and to assume its PID:
801     */
802    if (!thread_group_leader(tsk)) {
803        struct task_struct *leader = tsk->group_leader;
804
805        sig->notify_count = -1; /* for exit_notify() */
806        for (;;) {
807            write_lock_irq(&tasklist_lock);
808            if (likely(leader->exit_state))
809                break;
810            __set_current_state(TASK_UNINTERRUPTIBLE);
811            write_unlock_irq(&tasklist_lock);
812            schedule();
813        }
814
815        /*
816         * The only record we have of the real-time age of a
817         * process, regardless of execs it's done, is start_time.
818         * All the past CPU time is accumulated in signal_struct
819         * from sister threads now dead. But in this non-leader
820         * exec, nothing survives from the original leader thread,
821         * whose birth marks the true age of this process now.
822         * When we take on its identity by switching to its PID, we
823         * also take its birthdate (always earlier than our own).
824         */
825        tsk->start_time = leader->start_time;
826
827        BUG_ON(!same_thread_group(leader, tsk));
828        BUG_ON(has_group_leader_pid(tsk));
829        /*
830         * An exec() starts a new thread group with the
831         * TGID of the previous thread group. Rehash the
832         * two threads with a switched PID, and release
833         * the former thread group leader:
834         */
835
836        /* Become a process group leader with the old leader's pid.
837         * The old leader becomes a thread of this thread group.
838         * Note: The old leader also uses this pid until release_task
839         * is called. Odd but simple and correct.
840         */
841        detach_pid(tsk, PIDTYPE_PID);
842        tsk->pid = leader->pid;
843        attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
844        transfer_pid(leader, tsk, PIDTYPE_PGID);
845        transfer_pid(leader, tsk, PIDTYPE_SID);
846
847        list_replace_rcu(&leader->tasks, &tsk->tasks);
848        list_replace_init(&leader->sibling, &tsk->sibling);
849
850        tsk->group_leader = tsk;
851        leader->group_leader = tsk;
852
853        tsk->exit_signal = SIGCHLD;
854
855        BUG_ON(leader->exit_state != EXIT_ZOMBIE);
856        leader->exit_state = EXIT_DEAD;
857        write_unlock_irq(&tasklist_lock);
858
859        release_task(leader);
860    }
861
862    sig->group_exit_task = NULL;
863    sig->notify_count = 0;
864
865no_thread_group:
866    if (current->mm)
867        setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
868
869    exit_itimers(sig);
870    flush_itimer_signals();
871
872    if (atomic_read(&oldsighand->count) != 1) {
873        struct sighand_struct *newsighand;
874        /*
875         * This ->sighand is shared with the CLONE_SIGHAND
876         * but not CLONE_THREAD task, switch to the new one.
877         */
878        newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
879        if (!newsighand)
880            return -ENOMEM;
881
882        atomic_set(&newsighand->count, 1);
883        memcpy(newsighand->action, oldsighand->action,
884               sizeof(newsighand->action));
885
886        write_lock_irq(&tasklist_lock);
887        spin_lock(&oldsighand->siglock);
888        rcu_assign_pointer(tsk->sighand, newsighand);
889        spin_unlock(&oldsighand->siglock);
890        write_unlock_irq(&tasklist_lock);
891
892        __cleanup_sighand(oldsighand);
893    }
894
895    BUG_ON(!thread_group_leader(tsk));
896    return 0;
897}
898
899/*
900 * These functions flush out all traces of the currently running executable
901 * so that a new one can be started.
902 */
903static void flush_old_files(struct files_struct * files)
904{
905    long j = -1;
906    struct fdtable *fdt;
907
908    spin_lock(&files->file_lock);
909    for (;;) {
910        unsigned long set, i;
911
912        j++;
913        i = j * __NFDBITS;
914        fdt = files_fdtable(files);
915        if (i >= fdt->max_fds)
916            break;
917        set = fdt->close_on_exec->fds_bits[j];
918        if (!set)
919            continue;
920        fdt->close_on_exec->fds_bits[j] = 0;
921        spin_unlock(&files->file_lock);
922        for ( ; set ; i++,set >>= 1) {
923            if (set & 1) {
924                sys_close(i);
925            }
926        }
927        spin_lock(&files->file_lock);
928
929    }
930    spin_unlock(&files->file_lock);
931}
932
933char *get_task_comm(char *buf, struct task_struct *tsk)
934{
935    /* buf must be at least sizeof(tsk->comm) in size */
936    task_lock(tsk);
937    strncpy(buf, tsk->comm, sizeof(tsk->comm));
938    task_unlock(tsk);
939    return buf;
940}
941
942void set_task_comm(struct task_struct *tsk, char *buf)
943{
944    task_lock(tsk);
945
946    /*
947     * Threads may access current->comm without holding
948     * the task lock, so write the string carefully.
949     * Readers without a lock may see incomplete new
950     * names but are safe from non-terminating string reads.
951     */
952    memset(tsk->comm, 0, TASK_COMM_LEN);
953    wmb();
954    strlcpy(tsk->comm, buf, sizeof(tsk->comm));
955    task_unlock(tsk);
956    perf_event_comm(tsk);
957}
958
959int flush_old_exec(struct linux_binprm * bprm)
960{
961    int retval;
962
963    /*
964     * Make sure we have a private signal table and that
965     * we are unassociated from the previous thread group.
966     */
967    retval = de_thread(current);
968    if (retval)
969        goto out;
970
971    set_mm_exe_file(bprm->mm, bprm->file);
972
973    /*
974     * Release all of the old mmap stuff
975     */
976    retval = exec_mmap(bprm->mm);
977    if (retval)
978        goto out;
979
980    bprm->mm = NULL; /* We're using it now */
981
982    current->flags &= ~PF_RANDOMIZE;
983    flush_thread();
984    current->personality &= ~bprm->per_clear;
985
986    return 0;
987
988out:
989    return retval;
990}
991EXPORT_SYMBOL(flush_old_exec);
992
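/*
 * Finish switching to the new executable: pick the mmap layout, set the
 * dumpable state and the task comm, install the new task size, and flush
 * signal handlers and close-on-exec files.
 */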
993void setup_new_exec(struct linux_binprm * bprm)
994{
995    int i, ch;
996    char * name;
997    char tcomm[sizeof(current->comm)];
998
999    arch_pick_mmap_layout(current->mm);
1000
1001    /* This is the point of no return */
1002    current->sas_ss_sp = current->sas_ss_size = 0;
1003
1004    if (current_euid() == current_uid() && current_egid() == current_gid())
1005        set_dumpable(current->mm, 1);
1006    else
1007        set_dumpable(current->mm, suid_dumpable);
1008
1009    name = bprm->filename;
1010
1011    /* Copy the binary name, starting after the last slash */
1012    for (i=0; (ch = *(name++)) != '\0';) {
1013        if (ch == '/')
1014            i = 0; /* overwrite what we wrote */
1015        else
1016            if (i < (sizeof(tcomm) - 1))
1017                tcomm[i++] = ch;
1018    }
1019    tcomm[i] = '\0';
1020    set_task_comm(current, tcomm);
1021
1022    /* Set the new mm task size. We have to do that late because it may
1023     * depend on TIF_32BIT which is only updated in flush_thread() on
1024     * some architectures like powerpc
1025     */
1026    current->mm->task_size = TASK_SIZE;
1027
1028    /* install the new credentials */
1029    if (bprm->cred->uid != current_euid() ||
1030        bprm->cred->gid != current_egid()) {
1031        current->pdeath_signal = 0;
1032    } else if (file_permission(bprm->file, MAY_READ) ||
1033           bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) {
1034        set_dumpable(current->mm, suid_dumpable);
1035    }
1036
1037    /*
1038     * Flush performance counters when crossing a
1039     * security domain:
1040     */
1041    if (!get_dumpable(current->mm))
1042        perf_event_exit_task(current);
1043
1044    /* An exec changes our domain. We are no longer part of the thread
1045       group */
1046
1047    current->self_exec_id++;
1048            
1049    flush_signal_handlers(current, 0);
1050    flush_old_files(current->files);
1051}
1052EXPORT_SYMBOL(setup_new_exec);
1053
1054/*
1055 * Prepare credentials and lock ->cred_guard_mutex.
1056 * install_exec_creds() commits the new creds and drops the lock.
1057 * Or, if exec fails before that point, free_bprm() should release ->cred
1058 * and unlock.
1059 */
1060int prepare_bprm_creds(struct linux_binprm *bprm)
1061{
1062    if (mutex_lock_interruptible(&current->cred_guard_mutex))
1063        return -ERESTARTNOINTR;
1064
1065    bprm->cred = prepare_exec_creds();
1066    if (likely(bprm->cred))
1067        return 0;
1068
1069    mutex_unlock(&current->cred_guard_mutex);
1070    return -ENOMEM;
1071}
1072
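/*
 * Release a linux_binprm. If exec failed before install_exec_creds() ran,
 * this also drops the prepared credentials and unlocks cred_guard_mutex.
 */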
1073void free_bprm(struct linux_binprm *bprm)
1074{
1075    free_arg_pages(bprm);
1076    if (bprm->cred) {
1077        mutex_unlock(&current->cred_guard_mutex);
1078        abort_creds(bprm->cred);
1079    }
1080    kfree(bprm);
1081}
1082
1083/*
1084 * install the new credentials for this executable
1085 */
1086void install_exec_creds(struct linux_binprm *bprm)
1087{
1088    security_bprm_committing_creds(bprm);
1089
1090    commit_creds(bprm->cred);
1091    bprm->cred = NULL;
1092    /*
1093     * cred_guard_mutex must be held at least to this point to prevent
1094     * ptrace_attach() from altering our determination of the task's
1095     * credentials; any time after this it may be unlocked.
1096     */
1097    security_bprm_committed_creds(bprm);
1098    mutex_unlock(&current->cred_guard_mutex);
1099}
1100EXPORT_SYMBOL(install_exec_creds);
1101
1102/*
1103 * determine how safe it is to execute the proposed program
1104 * - the caller must hold current->cred_guard_mutex to protect against
1105 * PTRACE_ATTACH
1106 */
1107int check_unsafe_exec(struct linux_binprm *bprm)
1108{
1109    struct task_struct *p = current, *t;
1110    unsigned n_fs;
1111    int res = 0;
1112
1113    bprm->unsafe = tracehook_unsafe_exec(p);
1114
1115    n_fs = 1;
1116    write_lock(&p->fs->lock);
1117    rcu_read_lock();
1118    for (t = next_thread(p); t != p; t = next_thread(t)) {
1119        if (t->fs == p->fs)
1120            n_fs++;
1121    }
1122    rcu_read_unlock();
1123
1124    if (p->fs->users > n_fs) {
1125        bprm->unsafe |= LSM_UNSAFE_SHARE;
1126    } else {
1127        res = -EAGAIN;
1128        if (!p->fs->in_exec) {
1129            p->fs->in_exec = 1;
1130            res = 1;
1131        }
1132    }
1133    write_unlock(&p->fs->lock);
1134
1135    return res;
1136}
1137
1138/*
1139 * Fill the binprm structure from the inode.
1140 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
1141 *
1142 * This may be called multiple times for binary chains (scripts for example).
1143 */
1144int prepare_binprm(struct linux_binprm *bprm)
1145{
1146    umode_t mode;
1147    struct inode * inode = bprm->file->f_path.dentry->d_inode;
1148    int retval;
1149
1150    mode = inode->i_mode;
1151    if (bprm->file->f_op == NULL)
1152        return -EACCES;
1153
1154    /* clear any set[ug]id data left over from a previous binary */
1155    bprm->cred->euid = current_euid();
1156    bprm->cred->egid = current_egid();
1157
1158    if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
1159        /* Set-uid? */
1160        if (mode & S_ISUID) {
1161            bprm->per_clear |= PER_CLEAR_ON_SETID;
1162            bprm->cred->euid = inode->i_uid;
1163        }
1164
1165        /* Set-gid? */
1166        /*
1167         * If setgid is set but no group execute bit then this
1168         * is a candidate for mandatory locking, not a setgid
1169         * executable.
1170         */
1171        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1172            bprm->per_clear |= PER_CLEAR_ON_SETID;
1173            bprm->cred->egid = inode->i_gid;
1174        }
1175    }
1176
1177    /* fill in binprm security blob */
1178    retval = security_bprm_set_creds(bprm);
1179    if (retval)
1180        return retval;
1181    bprm->cred_prepared = 1;
1182
1183    memset(bprm->buf, 0, BINPRM_BUF_SIZE);
1184    return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
1185}
1186
1187EXPORT_SYMBOL(prepare_binprm);
1188
1189/*
1190 * Arguments are '\0' separated strings found at the location bprm->p
1191 * points to; chop off the first by relocating bprm->p to right after
1192 * the first '\0' encountered.
1193 */
1194int remove_arg_zero(struct linux_binprm *bprm)
1195{
1196    int ret = 0;
1197    unsigned long offset;
1198    char *kaddr;
1199    struct page *page;
1200
1201    if (!bprm->argc)
1202        return 0;
1203
1204    do {
1205        offset = bprm->p & ~PAGE_MASK;
1206        page = get_arg_page(bprm, bprm->p, 0);
1207        if (!page) {
1208            ret = -EFAULT;
1209            goto out;
1210        }
1211        kaddr = kmap_atomic(page, KM_USER0);
1212
1213        for (; offset < PAGE_SIZE && kaddr[offset];
1214                offset++, bprm->p++)
1215            ;
1216
1217        kunmap_atomic(kaddr, KM_USER0);
1218        put_arg_page(page);
1219
1220        if (offset == PAGE_SIZE)
1221            free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
1222    } while (offset == PAGE_SIZE);
1223
1224    bprm->p++;
1225    bprm->argc--;
1226    ret = 0;
1227
1228out:
1229    return ret;
1230}
1231EXPORT_SYMBOL(remove_arg_zero);
1232
1233/*
1234 * cycle through the list of binary format handlers until one recognizes the image
1235 */
1236int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1237{
1238    unsigned int depth = bprm->recursion_depth;
1239    int try,retval;
1240    struct linux_binfmt *fmt;
1241
1242    retval = security_bprm_check(bprm);
1243    if (retval)
1244        return retval;
1245
1246    /* kernel module loader fixup */
1247    /* so we don't try to run modprobe in kernel space. */
1248    set_fs(USER_DS);
1249
1250    retval = audit_bprm(bprm);
1251    if (retval)
1252        return retval;
1253
1254    retval = -ENOENT;
1255    for (try=0; try<2; try++) {
1256        read_lock(&binfmt_lock);
1257        list_for_each_entry(fmt, &formats, lh) {
1258            int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
1259            if (!fn)
1260                continue;
1261            if (!try_module_get(fmt->module))
1262                continue;
1263            read_unlock(&binfmt_lock);
1264            retval = fn(bprm, regs);
1265            /*
1266             * Restore the depth counter to its starting value
1267             * in this call, so we don't have to rely on every
1268             * load_binary function to restore it on return.
1269             */
1270            bprm->recursion_depth = depth;
1271            if (retval >= 0) {
1272                if (depth == 0)
1273                    tracehook_report_exec(fmt, bprm, regs);
1274                put_binfmt(fmt);
1275                allow_write_access(bprm->file);
1276                if (bprm->file)
1277                    fput(bprm->file);
1278                bprm->file = NULL;
1279                current->did_exec = 1;
1280                proc_exec_connector(current);
1281                return retval;
1282            }
1283            read_lock(&binfmt_lock);
1284            put_binfmt(fmt);
1285            if (retval != -ENOEXEC || bprm->mm == NULL)
1286                break;
1287            if (!bprm->file) {
1288                read_unlock(&binfmt_lock);
1289                return retval;
1290            }
1291        }
1292        read_unlock(&binfmt_lock);
1293        if (retval != -ENOEXEC || bprm->mm == NULL) {
1294            break;
1295#ifdef CONFIG_MODULES
1296        } else {
1297#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
1298            if (printable(bprm->buf[0]) &&
1299                printable(bprm->buf[1]) &&
1300                printable(bprm->buf[2]) &&
1301                printable(bprm->buf[3]))
1302                break; /* -ENOEXEC */
1303            request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
1304#endif
1305        }
1306    }
1307    return retval;
1308}
1309
1310EXPORT_SYMBOL(search_binary_handler);
1311
1312/*
1313 * sys_execve() executes a new program.
1314 */
1315int do_execve(char * filename,
1316    char __user *__user *argv,
1317    char __user *__user *envp,
1318    struct pt_regs * regs)
1319{
1320    struct linux_binprm *bprm;
1321    struct file *file;
1322    struct files_struct *displaced;
1323    bool clear_in_exec;
1324    int retval;
1325
1326    retval = unshare_files(&displaced);
1327    if (retval)
1328        goto out_ret;
1329
1330    retval = -ENOMEM;
1331    bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
1332    if (!bprm)
1333        goto out_files;
1334
1335    retval = prepare_bprm_creds(bprm);
1336    if (retval)
1337        goto out_free;
1338
1339    retval = check_unsafe_exec(bprm);
1340    if (retval < 0)
1341        goto out_free;
1342    clear_in_exec = retval;
1343    current->in_execve = 1;
1344
1345    file = open_exec(filename);
1346    retval = PTR_ERR(file);
1347    if (IS_ERR(file))
1348        goto out_unmark;
1349
1350    sched_exec();
1351
1352    bprm->file = file;
1353    bprm->filename = filename;
1354    bprm->interp = filename;
1355
1356    retval = bprm_mm_init(bprm);
1357    if (retval)
1358        goto out_file;
1359
1360    bprm->argc = count(argv, MAX_ARG_STRINGS);
1361    if ((retval = bprm->argc) < 0)
1362        goto out;
1363
1364    bprm->envc = count(envp, MAX_ARG_STRINGS);
1365    if ((retval = bprm->envc) < 0)
1366        goto out;
1367
1368    retval = prepare_binprm(bprm);
1369    if (retval < 0)
1370        goto out;
1371
1372    retval = copy_strings_kernel(1, &bprm->filename, bprm);
1373    if (retval < 0)
1374        goto out;
1375
1376    bprm->exec = bprm->p;
1377    retval = copy_strings(bprm->envc, envp, bprm);
1378    if (retval < 0)
1379        goto out;
1380
1381    retval = copy_strings(bprm->argc, argv, bprm);
1382    if (retval < 0)
1383        goto out;
1384
1385    current->flags &= ~PF_KTHREAD;
1386    retval = search_binary_handler(bprm,regs);
1387    if (retval < 0)
1388        goto out;
1389
1390    current->stack_start = current->mm->start_stack;
1391
1392    /* execve succeeded */
1393    current->fs->in_exec = 0;
1394    current->in_execve = 0;
1395    acct_update_integrals(current);
1396    free_bprm(bprm);
1397    if (displaced)
1398        put_files_struct(displaced);
1399    return retval;
1400
1401out:
1402    if (bprm->mm)
1403        mmput (bprm->mm);
1404
1405out_file:
1406    if (bprm->file) {
1407        allow_write_access(bprm->file);
1408        fput(bprm->file);
1409    }
1410
1411out_unmark:
1412    if (clear_in_exec)
1413        current->fs->in_exec = 0;
1414    current->in_execve = 0;
1415
1416out_free:
1417    free_bprm(bprm);
1418
1419out_files:
1420    if (displaced)
1421        reset_files_struct(displaced);
1422out_ret:
1423    return retval;
1424}
1425
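/*
 * Record the binary format handling current->mm, taking a module reference
 * on the new handler and dropping the reference on the old one.
 */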
1426void set_binfmt(struct linux_binfmt *new)
1427{
1428    struct mm_struct *mm = current->mm;
1429
1430    if (mm->binfmt)
1431        module_put(mm->binfmt->module);
1432
1433    mm->binfmt = new;
1434    if (new)
1435        __module_get(new->module);
1436}
1437
1438EXPORT_SYMBOL(set_binfmt);
1439
1440/* format_corename will inspect the pattern parameter, and output a
1441 * name into corename, which must have space for at least
1442 * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
1443 */
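/*
 * Recognized specifiers (see the switch below): %p pid, %u uid, %g gid,
 * %s signal number, %t dump time (seconds since the Epoch), %h hostname,
 * %e executable name, %c core size limit, %% a literal '%'. For example,
 * a core_pattern of "core.%e.%p" expands to something like "core.cat.1234".
 */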
1444static int format_corename(char *corename, long signr)
1445{
1446    const struct cred *cred = current_cred();
1447    const char *pat_ptr = core_pattern;
1448    int ispipe = (*pat_ptr == '|');
1449    char *out_ptr = corename;
1450    char *const out_end = corename + CORENAME_MAX_SIZE;
1451    int rc;
1452    int pid_in_pattern = 0;
1453
1454    /* Repeat as long as we have more pattern to process and more output
1455       space */
1456    while (*pat_ptr) {
1457        if (*pat_ptr != '%') {
1458            if (out_ptr == out_end)
1459                goto out;
1460            *out_ptr++ = *pat_ptr++;
1461        } else {
1462            switch (*++pat_ptr) {
1463            case 0:
1464                goto out;
1465            /* Double percent, output one percent */
1466            case '%':
1467                if (out_ptr == out_end)
1468                    goto out;
1469                *out_ptr++ = '%';
1470                break;
1471            /* pid */
1472            case 'p':
1473                pid_in_pattern = 1;
1474                rc = snprintf(out_ptr, out_end - out_ptr,
1475                          "%d", task_tgid_vnr(current));
1476                if (rc > out_end - out_ptr)
1477                    goto out;
1478                out_ptr += rc;
1479                break;
1480            /* uid */
1481            case 'u':
1482                rc = snprintf(out_ptr, out_end - out_ptr,
1483                          "%d", cred->uid);
1484                if (rc > out_end - out_ptr)
1485                    goto out;
1486                out_ptr += rc;
1487                break;
1488            /* gid */
1489            case 'g':
1490                rc = snprintf(out_ptr, out_end - out_ptr,
1491                          "%d", cred->gid);
1492                if (rc > out_end - out_ptr)
1493                    goto out;
1494                out_ptr += rc;
1495                break;
1496            /* signal that caused the coredump */
1497            case 's':
1498                rc = snprintf(out_ptr, out_end - out_ptr,
1499                          "%ld", signr);
1500                if (rc > out_end - out_ptr)
1501                    goto out;
1502                out_ptr += rc;
1503                break;
1504            /* UNIX time of coredump */
1505            case 't': {
1506                struct timeval tv;
1507                do_gettimeofday(&tv);
1508                rc = snprintf(out_ptr, out_end - out_ptr,
1509                          "%lu", tv.tv_sec);
1510                if (rc > out_end - out_ptr)
1511                    goto out;
1512                out_ptr += rc;
1513                break;
1514            }
1515            /* hostname */
1516            case 'h':
1517                down_read(&uts_sem);
1518                rc = snprintf(out_ptr, out_end - out_ptr,
1519                          "%s", utsname()->nodename);
1520                up_read(&uts_sem);
1521                if (rc > out_end - out_ptr)
1522                    goto out;
1523                out_ptr += rc;
1524                break;
1525            /* executable */
1526            case 'e':
1527                rc = snprintf(out_ptr, out_end - out_ptr,
1528                          "%s", current->comm);
1529                if (rc > out_end - out_ptr)
1530                    goto out;
1531                out_ptr += rc;
1532                break;
1533            /* core limit size */
1534            case 'c':
1535                rc = snprintf(out_ptr, out_end - out_ptr,
1536                          "%lu", rlimit(RLIMIT_CORE));
1537                if (rc > out_end - out_ptr)
1538                    goto out;
1539                out_ptr += rc;
1540                break;
1541            default:
1542                break;
1543            }
1544            ++pat_ptr;
1545        }
1546    }
1547    /* Backward compatibility with core_uses_pid:
1548     *
1549     * If core_pattern does not include a %p (as is the default)
1550     * and core_uses_pid is set, then .%pid will be appended to
1551     * the filename. Do not do this for piped commands. */
1552    if (!ispipe && !pid_in_pattern && core_uses_pid) {
1553        rc = snprintf(out_ptr, out_end - out_ptr,
1554                  ".%d", task_tgid_vnr(current));
1555        if (rc > out_end - out_ptr)
1556            goto out;
1557        out_ptr += rc;
1558    }
1559out:
1560    *out_ptr = 0;
1561    return ispipe;
1562}
1563
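/*
 * Mark the thread group of 'start' as exiting and SIGKILL every other
 * thread that still owns an mm. Returns the number of threads that must
 * report in before the dump can proceed.
 */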
1564static int zap_process(struct task_struct *start, int exit_code)
1565{
1566    struct task_struct *t;
1567    int nr = 0;
1568
1569    start->signal->flags = SIGNAL_GROUP_EXIT;
1570    start->signal->group_exit_code = exit_code;
1571    start->signal->group_stop_count = 0;
1572
1573    t = start;
1574    do {
1575        if (t != current && t->mm) {
1576            sigaddset(&t->pending.signal, SIGKILL);
1577            signal_wake_up(t, 1);
1578            nr++;
1579        }
1580    } while_each_thread(start, t);
1581
1582    return nr;
1583}
1584
1585static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
1586                struct core_state *core_state, int exit_code)
1587{
1588    struct task_struct *g, *p;
1589    unsigned long flags;
1590    int nr = -EAGAIN;
1591
1592    spin_lock_irq(&tsk->sighand->siglock);
1593    if (!signal_group_exit(tsk->signal)) {
1594        mm->core_state = core_state;
1595        nr = zap_process(tsk, exit_code);
1596    }
1597    spin_unlock_irq(&tsk->sighand->siglock);
1598    if (unlikely(nr < 0))
1599        return nr;
1600
1601    if (atomic_read(&mm->mm_users) == nr + 1)
1602        goto done;
1603    /*
1604     * We should find and kill all tasks which use this mm, and we should
1605     * count them correctly into ->nr_threads. We don't take tasklist
1606     * lock, but this is safe wrt:
1607     *
1608     * fork:
1609     * None of sub-threads can fork after zap_process(leader). All
1610     * processes which were created before this point should be
1611     * visible to zap_threads() because copy_process() adds the new
1612     * process to the tail of init_task.tasks list, and lock/unlock
1613     * of ->siglock provides a memory barrier.
1614     *
1615     * do_exit:
1616     * The caller holds mm->mmap_sem. This means that the task which
1617     * uses this mm can't pass exit_mm(), so it can't exit or clear
1618     * its ->mm.
1619     *
1620     * de_thread:
1621     * It does list_replace_rcu(&leader->tasks, &current->tasks),
1622     * we must see either old or new leader, this does not matter.
1623     * However, it can change p->sighand, so lock_task_sighand(p)
1624     * must be used. Since p->mm != NULL and we hold ->mmap_sem
1625     * it can't fail.
1626     *
1627     * Note also that "g" can be the old leader with ->mm == NULL
1628     * and already unhashed and thus removed from ->thread_group.
1629     * This is OK, __unhash_process()->list_del_rcu() does not
1630     * clear the ->next pointer, we will find the new leader via
1631     * next_thread().
1632     */
1633    rcu_read_lock();
1634    for_each_process(g) {
1635        if (g == tsk->group_leader)
1636            continue;
1637        if (g->flags & PF_KTHREAD)
1638            continue;
1639        p = g;
1640        do {
1641            if (p->mm) {
1642                if (unlikely(p->mm == mm)) {
1643                    lock_task_sighand(p, &flags);
1644                    nr += zap_process(p, exit_code);
1645                    unlock_task_sighand(p, &flags);
1646                }
1647                break;
1648            }
1649        } while_each_thread(g, p);
1650    }
1651    rcu_read_unlock();
1652done:
1653    atomic_set(&core_state->nr_threads, nr);
1654    return nr;
1655}
1656
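/*
 * Kill the other users of the dumping mm, drop mmap_sem, and wait until
 * all of them have parked in exit_mm(). Returns the number of waiting
 * threads, or a negative error if another group exit won the race.
 */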
1657static int coredump_wait(int exit_code, struct core_state *core_state)
1658{
1659    struct task_struct *tsk = current;
1660    struct mm_struct *mm = tsk->mm;
1661    struct completion *vfork_done;
1662    int core_waiters;
1663
1664    init_completion(&core_state->startup);
1665    core_state->dumper.task = tsk;
1666    core_state->dumper.next = NULL;
1667    core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1668    up_write(&mm->mmap_sem);
1669
1670    if (unlikely(core_waiters < 0))
1671        goto fail;
1672
1673    /*
1674     * Make sure nobody is waiting for us to release the VM,
1675     * otherwise we can deadlock when we wait on each other
1676     */
1677    vfork_done = tsk->vfork_done;
1678    if (vfork_done) {
1679        tsk->vfork_done = NULL;
1680        complete(vfork_done);
1681    }
1682
1683    if (core_waiters)
1684        wait_for_completion(&core_state->startup);
1685fail:
1686    return core_waiters;
1687}
1688
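/*
 * Wake up every thread that parked itself in exit_mm() waiting for the
 * dump to finish, then clear mm->core_state.
 */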
1689static void coredump_finish(struct mm_struct *mm)
1690{
1691    struct core_thread *curr, *next;
1692    struct task_struct *task;
1693
1694    next = mm->core_state->dumper.next;
1695    while ((curr = next) != NULL) {
1696        next = curr->next;
1697        task = curr->task;
1698        /*
1699         * see exit_mm(), curr->task must not see
1700         * ->task == NULL before we read ->next.
1701         */
1702        smp_mb();
1703        curr->task = NULL;
1704        wake_up_process(task);
1705    }
1706
1707    mm->core_state = NULL;
1708}
1709
1710/*
1711 * set_dumpable converts traditional three-value dumpable to two flags and
1712 * stores them into mm->flags. It modifies the lower two bits of mm->flags, but
1713 * these bits are not changed atomically, so get_dumpable can observe an
1714 * intermediate state. To avoid unexpected behaviour, get_dumpable returns
1715 * either the old dumpable value or the new one, by paying attention to the
1716 * order in which the bits are modified.
1717 *
1718 *   dumpable |   mm->flags (binary)
1719 *    old  new | initial interim  final
1720 *  ----------+-----------------------
1721 *      0    1 |   00      01      01
1722 *      0    2 |   00      10(*)   11
1723 *      1    0 |   01      00      00
1724 *      1    2 |   01      11      11
1725 *      2    0 |   11      10(*)   00
1726 *      2    1 |   11      11      01
1727 *
1728 * (*) get_dumpable regards interim value of 10 as 11.
1729 */
1730void set_dumpable(struct mm_struct *mm, int value)
1731{
1732    switch (value) {
1733    case 0:
1734        clear_bit(MMF_DUMPABLE, &mm->flags);
1735        smp_wmb();
1736        clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1737        break;
1738    case 1:
1739        set_bit(MMF_DUMPABLE, &mm->flags);
1740        smp_wmb();
1741        clear_bit(MMF_DUMP_SECURELY, &mm->flags);
1742        break;
1743    case 2:
1744        set_bit(MMF_DUMP_SECURELY, &mm->flags);
1745        smp_wmb();
1746        set_bit(MMF_DUMPABLE, &mm->flags);
1747        break;
1748    }
1749}
1750
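/*
 * Fold the two MMF_DUMPABLE bits back into a 0/1/2 value; the interim bit
 * pattern 10 is reported as 2, as noted in the table above.
 */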
1751static int __get_dumpable(unsigned long mm_flags)
1752{
1753    int ret;
1754
1755    ret = mm_flags & MMF_DUMPABLE_MASK;
1756    return (ret >= 2) ? 2 : ret;
1757}
1758
1759int get_dumpable(struct mm_struct *mm)
1760{
1761    return __get_dumpable(mm->flags);
1762}
1763
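/*
 * Used when dumping to a pipe with core_pipe_limit set: pose as an extra
 * reader on the dump pipe and wait until the usermode helper (the real
 * reader) has drained it and gone away.
 */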
1764static void wait_for_dump_helpers(struct file *file)
1765{
1766    struct pipe_inode_info *pipe;
1767
1768    pipe = file->f_path.dentry->d_inode->i_pipe;
1769
1770    pipe_lock(pipe);
1771    pipe->readers++;
1772    pipe->writers--;
1773
1774    while ((pipe->readers > 1) && (!signal_pending(current))) {
1775        wake_up_interruptible_sync(&pipe->wait);
1776        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1777        pipe_wait(pipe);
1778    }
1779
1780    pipe->readers--;
1781    pipe->writers++;
1782    pipe_unlock(pipe);
1783
1784}
1785
1786
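/*
 * Write out a core dump for the current process: check dumpability, stop
 * all users of the mm, expand core_pattern, open either a pipe to a
 * usermode helper or a core file, and hand off to the binfmt's
 * core_dump() method.
 */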
1787void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1788{
1789    struct core_state core_state;
1790    char corename[CORENAME_MAX_SIZE + 1];
1791    struct mm_struct *mm = current->mm;
1792    struct linux_binfmt * binfmt;
1793    struct inode * inode;
1794    const struct cred *old_cred;
1795    struct cred *cred;
1796    int retval = 0;
1797    int flag = 0;
1798    int ispipe = 0;
1799    char **helper_argv = NULL;
1800    int helper_argc = 0;
1801    int dump_count = 0;
1802    static atomic_t core_dump_count = ATOMIC_INIT(0);
1803    struct coredump_params cprm = {
1804        .signr = signr,
1805        .regs = regs,
1806        .limit = rlimit(RLIMIT_CORE),
1807        /*
1808         * We must use the same mm->flags while dumping core to avoid
1809         * inconsistency of bit flags, since this flag is not protected
1810         * by any locks.
1811         */
1812        .mm_flags = mm->flags,
1813    };
1814
1815    audit_core_dumps(signr);
1816
1817    binfmt = mm->binfmt;
1818    if (!binfmt || !binfmt->core_dump)
1819        goto fail;
1820
1821    cred = prepare_creds();
1822    if (!cred) {
1823        retval = -ENOMEM;
1824        goto fail;
1825    }
1826
1827    down_write(&mm->mmap_sem);
1828    /*
1829     * If another thread got here first, or we are not dumpable, bail out.
1830     */
1831    if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1832        up_write(&mm->mmap_sem);
1833        put_cred(cred);
1834        goto fail;
1835    }
1836
1837    /*
1838     * We cannot trust fsuid as being the "true" uid of the
1839     * process nor do we know its entire history. We only know it
1840     * was tainted so we dump it as root in mode 2.
1841     */
1842    if (__get_dumpable(cprm.mm_flags) == 2) {
1843        /* Setuid core dump mode */
1844        flag = O_EXCL; /* Stop rewrite attacks */
1845        cred->fsuid = 0; /* Dump root private */
1846    }
1847
1848    retval = coredump_wait(exit_code, &core_state);
1849    if (retval < 0) {
1850        put_cred(cred);
1851        goto fail;
1852    }
1853
1854    old_cred = override_creds(cred);
1855
1856    /*
1857     * Clear any false indication of pending signals that might
1858     * be seen by the filesystem code called to write the core file.
1859     */
1860    clear_thread_flag(TIF_SIGPENDING);
1861
1862    /*
1863     * lock_kernel() because format_corename() is controlled by sysctl, which
1864     * uses lock_kernel()
1865     */
1866     lock_kernel();
1867    ispipe = format_corename(corename, signr);
1868    unlock_kernel();
1869
1870    if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1871        goto fail_unlock;
1872
1873     if (ispipe) {
1874        if (cprm.limit == 0) {
1875            /*
1876             * Normally core limits are irrelevant to pipes, since
1877             * we're not writing to the file system, but we use
1878             * cprm.limit of 0 here as a special value. Any
1879             * non-zero limit gets set to RLIM_INFINITY below, but
1880             * a limit of 0 skips the dump. This is a consistent
1881             * way to catch recursive crashes. We can still crash
1882             * if the core_pattern binary sets RLIM_CORE = !0
1883             * but it runs as root, and can do lots of stupid things
1884             * Note that we use task_tgid_vnr here to grab the pid
1885             * of the process group leader. That way we get the
1886             * right pid if a thread in a multi-threaded
1887             * core_pattern process dies.
1888             */
1889            printk(KERN_WARNING
1890                "Process %d(%s) has RLIMIT_CORE set to 0\n",
1891                task_tgid_vnr(current), current->comm);
1892            printk(KERN_WARNING "Aborting core\n");
1893            goto fail_unlock;
1894        }
1895
1896        dump_count = atomic_inc_return(&core_dump_count);
1897        if (core_pipe_limit && (core_pipe_limit < dump_count)) {
1898            printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
1899                   task_tgid_vnr(current), current->comm);
1900            printk(KERN_WARNING "Skipping core dump\n");
1901            goto fail_dropcount;
1902        }
1903
1904        helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1905        if (!helper_argv) {
1906            printk(KERN_WARNING "%s failed to allocate memory\n",
1907                   __func__);
1908            goto fail_dropcount;
1909        }
1910
1911        cprm.limit = RLIM_INFINITY;
1912
1913        /* SIGPIPE can happen, but it's just never processed */
1914        if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1915                &cprm.file)) {
1916             printk(KERN_INFO "Core dump to %s pipe failed\n",
1917                   corename);
1918            goto fail_dropcount;
1919         }
1920     } else
1921        cprm.file = filp_open(corename,
1922                 O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | flag,
1923                 0600);
1924    if (IS_ERR(cprm.file))
1925        goto fail_dropcount;
1926    inode = cprm.file->f_path.dentry->d_inode;
1927    if (inode->i_nlink > 1)
1928        goto close_fail; /* multiple links - don't dump */
1929    if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1930        goto close_fail;
1931
1932    /* AK: actually I see no reason to not allow this for named pipes etc.,
1933       but keep the previous behaviour for now. */
1934    if (!ispipe && !S_ISREG(inode->i_mode))
1935        goto close_fail;
1936    /*
1937     * Don't let local users get cute and trick others into dumping core
1938     * into their pre-created files:
1939     * Note: this is not relevant for pipes.
1940     */
1941    if (!ispipe && (inode->i_uid != current_fsuid()))
1942        goto close_fail;
1943    if (!cprm.file->f_op)
1944        goto close_fail;
1945    if (!cprm.file->f_op->write)
1946        goto close_fail;
1947    if (!ispipe &&
1948        do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1949        goto close_fail;
1950
1951    retval = binfmt->core_dump(&cprm);
1952
1953    if (retval)
1954        current->signal->group_exit_code |= 0x80;
1955close_fail:
1956    if (ispipe && core_pipe_limit)
1957        wait_for_dump_helpers(cprm.file);
1958    filp_close(cprm.file, NULL);
1959fail_dropcount:
1960    if (dump_count)
1961        atomic_dec(&core_dump_count);
1962fail_unlock:
1963    if (helper_argv)
1964        argv_free(helper_argv);
1965
1966    revert_creds(old_cred);
1967    put_cred(cred);
1968    coredump_finish(mm);
1969fail:
1970    return;
1971}
1972
