Root/kernel/exit.c

/*
 * linux/kernel/exit.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */
6
7#include <linux/mm.h>
8#include <linux/slab.h>
9#include <linux/interrupt.h>
10#include <linux/module.h>
11#include <linux/capability.h>
12#include <linux/completion.h>
13#include <linux/personality.h>
14#include <linux/tty.h>
15#include <linux/iocontext.h>
16#include <linux/key.h>
17#include <linux/security.h>
18#include <linux/cpu.h>
19#include <linux/acct.h>
20#include <linux/tsacct_kern.h>
21#include <linux/file.h>
22#include <linux/fdtable.h>
23#include <linux/binfmts.h>
24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
26#include <linux/ptrace.h>
27#include <linux/profile.h>
28#include <linux/mount.h>
29#include <linux/proc_fs.h>
30#include <linux/kthread.h>
31#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h>
36#include <linux/syscalls.h>
37#include <linux/signal.h>
38#include <linux/posix-timers.h>
39#include <linux/cn_proc.h>
40#include <linux/mutex.h>
41#include <linux/futex.h>
42#include <linux/pipe_fs_i.h>
43#include <linux/audit.h> /* for audit_free() */
44#include <linux/resource.h>
45#include <linux/blkdev.h>
46#include <linux/task_io_accounting_ops.h>
47#include <linux/tracehook.h>
48#include <linux/fs_struct.h>
49#include <linux/init_task.h>
50#include <linux/perf_event.h>
51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h>
53
54#include <asm/uaccess.h>
55#include <asm/unistd.h>
56#include <asm/pgtable.h>
57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59
/* Forward declaration: daemonize() calls exit_mm() ahead of its definition. */
static void exit_mm(struct task_struct *tsk);
61
62static void __unhash_process(struct task_struct *p)
63{
64    nr_threads--;
65    detach_pid(p, PIDTYPE_PID);
66    if (thread_group_leader(p)) {
67        detach_pid(p, PIDTYPE_PGID);
68        detach_pid(p, PIDTYPE_SID);
69
70        list_del_rcu(&p->tasks);
71        list_del_init(&p->sibling);
72        __get_cpu_var(process_counts)--;
73    }
74    list_del_rcu(&p->thread_group);
75}
76
77/*
78 * This function expects the tasklist_lock write-locked.
79 */
80static void __exit_signal(struct task_struct *tsk)
81{
82    struct signal_struct *sig = tsk->signal;
83    struct sighand_struct *sighand;
84
85    BUG_ON(!sig);
86    BUG_ON(!atomic_read(&sig->count));
87
88    sighand = rcu_dereference_check(tsk->sighand,
89                    rcu_read_lock_held() ||
90                    lockdep_tasklist_lock_is_held());
91    spin_lock(&sighand->siglock);
92
93    posix_cpu_timers_exit(tsk);
94    if (atomic_dec_and_test(&sig->count))
95        posix_cpu_timers_exit_group(tsk);
96    else {
97        /*
98         * If there is any task waiting for the group exit
99         * then notify it:
100         */
101        if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count)
102            wake_up_process(sig->group_exit_task);
103
104        if (tsk == sig->curr_target)
105            sig->curr_target = next_thread(tsk);
106        /*
107         * Accumulate here the counters for all threads but the
108         * group leader as they die, so they can be added into
109         * the process-wide totals when those are taken.
110         * The group leader stays around as a zombie as long
111         * as there are other threads. When it gets reaped,
112         * the exit.c code will add its counts into these totals.
113         * We won't ever get here for the group leader, since it
114         * will have been the last reference on the signal_struct.
115         */
116        sig->utime = cputime_add(sig->utime, tsk->utime);
117        sig->stime = cputime_add(sig->stime, tsk->stime);
118        sig->gtime = cputime_add(sig->gtime, tsk->gtime);
119        sig->min_flt += tsk->min_flt;
120        sig->maj_flt += tsk->maj_flt;
121        sig->nvcsw += tsk->nvcsw;
122        sig->nivcsw += tsk->nivcsw;
123        sig->inblock += task_io_get_inblock(tsk);
124        sig->oublock += task_io_get_oublock(tsk);
125        task_io_accounting_add(&sig->ioac, &tsk->ioac);
126        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
127        sig = NULL; /* Marker for below. */
128    }
129
130    __unhash_process(tsk);
131
132    /*
133     * Do this under ->siglock, we can race with another thread
134     * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
135     */
136    flush_sigqueue(&tsk->pending);
137
138    tsk->signal = NULL;
139    tsk->sighand = NULL;
140    spin_unlock(&sighand->siglock);
141
142    __cleanup_sighand(sighand);
143    clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
144    if (sig) {
145        flush_sigqueue(&sig->shared_pending);
146        taskstats_tgid_free(sig);
147        /*
148         * Make sure ->signal can't go away under rq->lock,
149         * see account_group_exec_runtime().
150         */
151        task_rq_unlock_wait(tsk);
152        __cleanup_signal(sig);
153    }
154}
155
156static void delayed_put_task_struct(struct rcu_head *rhp)
157{
158    struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
159
160#ifdef CONFIG_PERF_EVENTS
161    WARN_ON_ONCE(tsk->perf_event_ctxp);
162#endif
163    trace_sched_process_free(tsk);
164    put_task_struct(tsk);
165}
166
167
168void release_task(struct task_struct * p)
169{
170    struct task_struct *leader;
171    int zap_leader;
172repeat:
173    tracehook_prepare_release_task(p);
174    /* don't need to get the RCU readlock here - the process is dead and
175     * can't be modifying its own credentials. But shut RCU-lockdep up */
176    rcu_read_lock();
177    atomic_dec(&__task_cred(p)->user->processes);
178    rcu_read_unlock();
179
180    proc_flush_task(p);
181
182    write_lock_irq(&tasklist_lock);
183    tracehook_finish_release_task(p);
184    __exit_signal(p);
185
186    /*
187     * If we are the last non-leader member of the thread
188     * group, and the leader is zombie, then notify the
189     * group leader's parent process. (if it wants notification.)
190     */
191    zap_leader = 0;
192    leader = p->group_leader;
193    if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
194        BUG_ON(task_detached(leader));
195        do_notify_parent(leader, leader->exit_signal);
196        /*
197         * If we were the last child thread and the leader has
198         * exited already, and the leader's parent ignores SIGCHLD,
199         * then we are the one who should release the leader.
200         *
201         * do_notify_parent() will have marked it self-reaping in
202         * that case.
203         */
204        zap_leader = task_detached(leader);
205
206        /*
207         * This maintains the invariant that release_task()
208         * only runs on a task in EXIT_DEAD, just for sanity.
209         */
210        if (zap_leader)
211            leader->exit_state = EXIT_DEAD;
212    }
213
214    write_unlock_irq(&tasklist_lock);
215    release_thread(p);
216    call_rcu(&p->rcu, delayed_put_task_struct);
217
218    p = leader;
219    if (unlikely(zap_leader))
220        goto repeat;
221}
222
223/*
224 * This checks not only the pgrp, but falls back on the pid if no
225 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
226 * without this...
227 *
228 * The caller must hold rcu lock or the tasklist lock.
229 */
230struct pid *session_of_pgrp(struct pid *pgrp)
231{
232    struct task_struct *p;
233    struct pid *sid = NULL;
234
235    p = pid_task(pgrp, PIDTYPE_PGID);
236    if (p == NULL)
237        p = pid_task(pgrp, PIDTYPE_PID);
238    if (p != NULL)
239        sid = task_session(p);
240
241    return sid;
242}
243
244/*
245 * Determine if a process group is "orphaned", according to the POSIX
246 * definition in 2.2.2.52. Orphaned process groups are not to be affected
247 * by terminal-generated stop signals. Newly orphaned process groups are
248 * to receive a SIGHUP and a SIGCONT.
249 *
250 * "I ask you, have you ever known what it is to be an orphan?"
251 */
252static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
253{
254    struct task_struct *p;
255
256    do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
257        if ((p == ignored_task) ||
258            (p->exit_state && thread_group_empty(p)) ||
259            is_global_init(p->real_parent))
260            continue;
261
262        if (task_pgrp(p->real_parent) != pgrp &&
263            task_session(p->real_parent) == task_session(p))
264            return 0;
265    } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
266
267    return 1;
268}
269
270int is_current_pgrp_orphaned(void)
271{
272    int retval;
273
274    read_lock(&tasklist_lock);
275    retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
276    read_unlock(&tasklist_lock);
277
278    return retval;
279}
280
281static int has_stopped_jobs(struct pid *pgrp)
282{
283    int retval = 0;
284    struct task_struct *p;
285
286    do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
287        if (!task_is_stopped(p))
288            continue;
289        retval = 1;
290        break;
291    } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
292    return retval;
293}
294
295/*
296 * Check to see if any process groups have become orphaned as
297 * a result of our exiting, and if they have any stopped jobs,
298 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
299 */
300static void
301kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
302{
303    struct pid *pgrp = task_pgrp(tsk);
304    struct task_struct *ignored_task = tsk;
305
306    if (!parent)
307         /* exit: our father is in a different pgrp than
308          * we are and we were the only connection outside.
309          */
310        parent = tsk->real_parent;
311    else
312        /* reparent: our child is in a different pgrp than
313         * we are, and it was the only connection outside.
314         */
315        ignored_task = NULL;
316
317    if (task_pgrp(parent) != pgrp &&
318        task_session(parent) == task_session(tsk) &&
319        will_become_orphaned_pgrp(pgrp, ignored_task) &&
320        has_stopped_jobs(pgrp)) {
321        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
322        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
323    }
324}
325
326/**
327 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
328 *
329 * If a kernel thread is launched as a result of a system call, or if
330 * it ever exits, it should generally reparent itself to kthreadd so it
331 * isn't in the way of other processes and is correctly cleaned up on exit.
332 *
333 * The various task state such as scheduling policy and priority may have
334 * been inherited from a user process, so we reset them to sane values here.
335 *
336 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
337 */
338static void reparent_to_kthreadd(void)
339{
340    write_lock_irq(&tasklist_lock);
341
342    ptrace_unlink(current);
343    /* Reparent to init */
344    current->real_parent = current->parent = kthreadd_task;
345    list_move_tail(&current->sibling, &current->real_parent->children);
346
347    /* Set the exit signal to SIGCHLD so we signal init on exit */
348    current->exit_signal = SIGCHLD;
349
350    if (task_nice(current) < 0)
351        set_user_nice(current, 0);
352    /* cpus_allowed? */
353    /* rt_priority? */
354    /* signals? */
355    memcpy(current->signal->rlim, init_task.signal->rlim,
356           sizeof(current->signal->rlim));
357
358    atomic_inc(&init_cred.usage);
359    commit_creds(&init_cred);
360    write_unlock_irq(&tasklist_lock);
361}
362
363void __set_special_pids(struct pid *pid)
364{
365    struct task_struct *curr = current->group_leader;
366
367    if (task_session(curr) != pid)
368        change_pid(curr, PIDTYPE_SID, pid);
369
370    if (task_pgrp(curr) != pid)
371        change_pid(curr, PIDTYPE_PGID, pid);
372}
373
374static void set_special_pids(struct pid *pid)
375{
376    write_lock_irq(&tasklist_lock);
377    __set_special_pids(pid);
378    write_unlock_irq(&tasklist_lock);
379}
380
381/*
382 * Let kernel threads use this to say that they allow a certain signal.
383 * Must not be used if kthread was cloned with CLONE_SIGHAND.
384 */
385int allow_signal(int sig)
386{
387    if (!valid_signal(sig) || sig < 1)
388        return -EINVAL;
389
390    spin_lock_irq(&current->sighand->siglock);
391    /* This is only needed for daemonize()'ed kthreads */
392    sigdelset(&current->blocked, sig);
393    /*
394     * Kernel threads handle their own signals. Let the signal code
395     * know it'll be handled, so that they don't get converted to
396     * SIGKILL or just silently dropped.
397     */
398    current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
399    recalc_sigpending();
400    spin_unlock_irq(&current->sighand->siglock);
401    return 0;
402}
403
404EXPORT_SYMBOL(allow_signal);
405
406int disallow_signal(int sig)
407{
408    if (!valid_signal(sig) || sig < 1)
409        return -EINVAL;
410
411    spin_lock_irq(&current->sighand->siglock);
412    current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
413    recalc_sigpending();
414    spin_unlock_irq(&current->sighand->siglock);
415    return 0;
416}
417
418EXPORT_SYMBOL(disallow_signal);
419
420/*
421 * Put all the gunge required to become a kernel thread without
422 * attached user resources in one place where it belongs.
423 */
424
425void daemonize(const char *name, ...)
426{
427    va_list args;
428    sigset_t blocked;
429
430    va_start(args, name);
431    vsnprintf(current->comm, sizeof(current->comm), name, args);
432    va_end(args);
433
434    /*
435     * If we were started as result of loading a module, close all of the
436     * user space pages. We don't need them, and if we didn't close them
437     * they would be locked into memory.
438     */
439    exit_mm(current);
440    /*
441     * We don't want to have TIF_FREEZE set if the system-wide hibernation
442     * or suspend transition begins right now.
443     */
444    current->flags |= (PF_NOFREEZE | PF_KTHREAD);
445
446    if (current->nsproxy != &init_nsproxy) {
447        get_nsproxy(&init_nsproxy);
448        switch_task_namespaces(current, &init_nsproxy);
449    }
450    set_special_pids(&init_struct_pid);
451    proc_clear_tty(current);
452
453    /* Block and flush all signals */
454    sigfillset(&blocked);
455    sigprocmask(SIG_BLOCK, &blocked, NULL);
456    flush_signals(current);
457
458    /* Become as one with the init task */
459
460    daemonize_fs_struct();
461    exit_files(current);
462    current->files = init_task.files;
463    atomic_inc(&current->files->count);
464
465    reparent_to_kthreadd();
466}
467
468EXPORT_SYMBOL(daemonize);
469
470static void close_files(struct files_struct * files)
471{
472    int i, j;
473    struct fdtable *fdt;
474
475    j = 0;
476
477    /*
478     * It is safe to dereference the fd table without RCU or
479     * ->file_lock because this is the last reference to the
480     * files structure. But use RCU to shut RCU-lockdep up.
481     */
482    rcu_read_lock();
483    fdt = files_fdtable(files);
484    rcu_read_unlock();
485    for (;;) {
486        unsigned long set;
487        i = j * __NFDBITS;
488        if (i >= fdt->max_fds)
489            break;
490        set = fdt->open_fds->fds_bits[j++];
491        while (set) {
492            if (set & 1) {
493                struct file * file = xchg(&fdt->fd[i], NULL);
494                if (file) {
495                    filp_close(file, files);
496                    cond_resched();
497                }
498            }
499            i++;
500            set >>= 1;
501        }
502    }
503}
504
505struct files_struct *get_files_struct(struct task_struct *task)
506{
507    struct files_struct *files;
508
509    task_lock(task);
510    files = task->files;
511    if (files)
512        atomic_inc(&files->count);
513    task_unlock(task);
514
515    return files;
516}
517
518void put_files_struct(struct files_struct *files)
519{
520    struct fdtable *fdt;
521
522    if (atomic_dec_and_test(&files->count)) {
523        close_files(files);
524        /*
525         * Free the fd and fdset arrays if we expanded them.
526         * If the fdtable was embedded, pass files for freeing
527         * at the end of the RCU grace period. Otherwise,
528         * you can free files immediately.
529         */
530        rcu_read_lock();
531        fdt = files_fdtable(files);
532        if (fdt != &files->fdtab)
533            kmem_cache_free(files_cachep, files);
534        free_fdtable(fdt);
535        rcu_read_unlock();
536    }
537}
538
539void reset_files_struct(struct files_struct *files)
540{
541    struct task_struct *tsk = current;
542    struct files_struct *old;
543
544    old = tsk->files;
545    task_lock(tsk);
546    tsk->files = files;
547    task_unlock(tsk);
548    put_files_struct(old);
549}
550
551void exit_files(struct task_struct *tsk)
552{
553    struct files_struct * files = tsk->files;
554
555    if (files) {
556        task_lock(tsk);
557        tsk->files = NULL;
558        task_unlock(tsk);
559        put_files_struct(files);
560    }
561}
562
#ifdef CONFIG_MM_OWNER
/*
 * Task p is exiting and it owned mm, lets find a new owner for it.
 *
 * Returns 1 only when @mm has users besides the exiting owner and @p
 * really is the current owner; 0 otherwise.
 */
static inline int
mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
{
    return atomic_read(&mm->mm_users) > 1 && mm->owner == p;
}

/*
 * Hand ownership of @mm from the calling task to another task that
 * still uses it.  Candidates are searched among the caller's children,
 * then its siblings, then every thread in the system.  If none is
 * found, mm->owner is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
    struct task_struct *me = current;
    struct task_struct *grp, *cand;

retry:
    if (!mm_need_new_owner(mm, me))
        return;

    read_lock(&tasklist_lock);

    /* First choice: one of our children. */
    list_for_each_entry(cand, &me->children, sibling) {
        if (cand->mm == mm)
            goto assign_new_owner;
    }

    /* Second choice: one of our siblings. */
    list_for_each_entry(cand, &me->real_parent->children, sibling) {
        if (cand->mm == mm)
            goto assign_new_owner;
    }

    /* Last resort: scan everything. We should not get here often. */
    do_each_thread(grp, cand) {
        if (cand->mm == mm)
            goto assign_new_owner;
    } while_each_thread(grp, cand);

    read_unlock(&tasklist_lock);
    /*
     * We found no owner yet mm_users > 1: this implies that we are
     * most likely racing with swapoff (try_to_unuse()) or /proc or
     * ptrace or page migration (get_task_mm()). Mark owner as NULL.
     */
    mm->owner = NULL;
    return;

assign_new_owner:
    BUG_ON(cand == me);
    get_task_struct(cand);
    /*
     * The task_lock protects cand->mm from changing.
     * We always want mm->owner->mm == mm
     */
    task_lock(cand);
    /*
     * tasklist_lock is only released once task_lock() is held, so
     * that cand cannot slip away underneath us.
     */
    read_unlock(&tasklist_lock);
    if (cand->mm != mm) {
        /* The candidate stopped using this mm: start over. */
        task_unlock(cand);
        put_task_struct(cand);
        goto retry;
    }
    mm->owner = cand;
    task_unlock(cand);
    put_task_struct(cand);
}
#endif /* CONFIG_MM_OWNER */
647
648/*
649 * Turn us into a lazy TLB process if we
650 * aren't already..
651 */
652static void exit_mm(struct task_struct * tsk)
653{
654    struct mm_struct *mm = tsk->mm;
655    struct core_state *core_state;
656
657    mm_release(tsk, mm);
658    if (!mm)
659        return;
660    /*
661     * Serialize with any possible pending coredump.
662     * We must hold mmap_sem around checking core_state
663     * and clearing tsk->mm. The core-inducing thread
664     * will increment ->nr_threads for each thread in the
665     * group with ->mm != NULL.
666     */
667    down_read(&mm->mmap_sem);
668    core_state = mm->core_state;
669    if (core_state) {
670        struct core_thread self;
671        up_read(&mm->mmap_sem);
672
673        self.task = tsk;
674        self.next = xchg(&core_state->dumper.next, &self);
675        /*
676         * Implies mb(), the result of xchg() must be visible
677         * to core_state->dumper.
678         */
679        if (atomic_dec_and_test(&core_state->nr_threads))
680            complete(&core_state->startup);
681
682        for (;;) {
683            set_task_state(tsk, TASK_UNINTERRUPTIBLE);
684            if (!self.task) /* see coredump_finish() */
685                break;
686            schedule();
687        }
688        __set_task_state(tsk, TASK_RUNNING);
689        down_read(&mm->mmap_sem);
690    }
691    atomic_inc(&mm->mm_count);
692    BUG_ON(mm != tsk->active_mm);
693    /* more a memory barrier than a real lock */
694    task_lock(tsk);
695    tsk->mm = NULL;
696    up_read(&mm->mmap_sem);
697    enter_lazy_tlb(mm, current);
698    /* We don't want this task to be frozen prematurely */
699    clear_freeze_flag(tsk);
700    task_unlock(tsk);
701    mm_update_next_owner(mm);
702    mmput(mm);
703}
704
705/*
706 * When we die, we re-parent all our children.
707 * Try to give them to another thread in our thread
708 * group, and if no such member exists, give it to
709 * the child reaper process (ie "init") in our pid
710 * space.
711 */
712static struct task_struct *find_new_reaper(struct task_struct *father)
713{
714    struct pid_namespace *pid_ns = task_active_pid_ns(father);
715    struct task_struct *thread;
716
717    thread = father;
718    while_each_thread(father, thread) {
719        if (thread->flags & PF_EXITING)
720            continue;
721        if (unlikely(pid_ns->child_reaper == father))
722            pid_ns->child_reaper = thread;
723        return thread;
724    }
725
726    if (unlikely(pid_ns->child_reaper == father)) {
727        write_unlock_irq(&tasklist_lock);
728        if (unlikely(pid_ns == &init_pid_ns))
729            panic("Attempted to kill init!");
730
731        zap_pid_ns_processes(pid_ns);
732        write_lock_irq(&tasklist_lock);
733        /*
734         * We can not clear ->child_reaper or leave it alone.
735         * There may by stealth EXIT_DEAD tasks on ->children,
736         * forget_original_parent() must move them somewhere.
737         */
738        pid_ns->child_reaper = init_pid_ns.child_reaper;
739    }
740
741    return pid_ns->child_reaper;
742}
743
744/*
745* Any that need to be release_task'd are put on the @dead list.
746 */
747static void reparent_leader(struct task_struct *father, struct task_struct *p,
748                struct list_head *dead)
749{
750    list_move_tail(&p->sibling, &p->real_parent->children);
751
752    if (task_detached(p))
753        return;
754    /*
755     * If this is a threaded reparent there is no need to
756     * notify anyone anything has happened.
757     */
758    if (same_thread_group(p->real_parent, father))
759        return;
760
761    /* We don't want people slaying init. */
762    p->exit_signal = SIGCHLD;
763
764    /* If it has exited notify the new parent about this child's death. */
765    if (!task_ptrace(p) &&
766        p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
767        do_notify_parent(p, p->exit_signal);
768        if (task_detached(p)) {
769            p->exit_state = EXIT_DEAD;
770            list_move_tail(&p->sibling, dead);
771        }
772    }
773
774    kill_orphaned_pgrp(p, father);
775}
776
777static void forget_original_parent(struct task_struct *father)
778{
779    struct task_struct *p, *n, *reaper;
780    LIST_HEAD(dead_children);
781
782    exit_ptrace(father);
783
784    write_lock_irq(&tasklist_lock);
785    reaper = find_new_reaper(father);
786
787    list_for_each_entry_safe(p, n, &father->children, sibling) {
788        struct task_struct *t = p;
789        do {
790            t->real_parent = reaper;
791            if (t->parent == father) {
792                BUG_ON(task_ptrace(t));
793                t->parent = t->real_parent;
794            }
795            if (t->pdeath_signal)
796                group_send_sig_info(t->pdeath_signal,
797                            SEND_SIG_NOINFO, t);
798        } while_each_thread(p, t);
799        reparent_leader(father, p, &dead_children);
800    }
801    write_unlock_irq(&tasklist_lock);
802
803    BUG_ON(!list_empty(&father->children));
804
805    list_for_each_entry_safe(p, n, &dead_children, sibling) {
806        list_del_init(&p->sibling);
807        release_task(p);
808    }
809}
810
811/*
812 * Send signals to all our closest relatives so that they know
813 * to properly mourn us..
814 */
815static void exit_notify(struct task_struct *tsk, int group_dead)
816{
817    int signal;
818    void *cookie;
819
820    /*
821     * This does two things:
822     *
823       * A. Make init inherit all the child processes
824     * B. Check to see if any process groups have become orphaned
825     * as a result of our exiting, and if they have any stopped
826     * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
827     */
828    forget_original_parent(tsk);
829    exit_task_namespaces(tsk);
830
831    write_lock_irq(&tasklist_lock);
832    if (group_dead)
833        kill_orphaned_pgrp(tsk->group_leader, NULL);
834
835    /* Let father know we died
836     *
837     * Thread signals are configurable, but you aren't going to use
838     * that to send signals to arbitary processes.
839     * That stops right now.
840     *
841     * If the parent exec id doesn't match the exec id we saved
842     * when we started then we know the parent has changed security
843     * domain.
844     *
845     * If our self_exec id doesn't match our parent_exec_id then
846     * we have changed execution domain as these two values started
847     * the same after a fork.
848     */
849    if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
850        (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
851         tsk->self_exec_id != tsk->parent_exec_id))
852        tsk->exit_signal = SIGCHLD;
853
854    signal = tracehook_notify_death(tsk, &cookie, group_dead);
855    if (signal >= 0)
856        signal = do_notify_parent(tsk, signal);
857
858    tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
859
860    /* mt-exec, de_thread() is waiting for us */
861    if (thread_group_leader(tsk) &&
862        tsk->signal->group_exit_task &&
863        tsk->signal->notify_count < 0)
864        wake_up_process(tsk->signal->group_exit_task);
865
866    write_unlock_irq(&tasklist_lock);
867
868    tracehook_report_death(tsk, signal, cookie, group_dead);
869
870    /* If the process is dead, release it - nobody will wait for it */
871    if (signal == DEATH_REAP)
872        release_task(tsk);
873}
874
#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Log a warning whenever the exiting task left less unused stack than
 * any task seen before.  The low-water mark is tested once without the
 * lock as a fast path and again under it before being updated.
 */
static void check_stack_usage(void)
{
    static DEFINE_SPINLOCK(low_water_lock);
    static int lowest_to_date = THREAD_SIZE;
    unsigned long unused = stack_not_used(current);

    if (unused >= lowest_to_date)
        return;

    spin_lock(&low_water_lock);
    if (unused < lowest_to_date) {
        printk(KERN_WARNING "%s used greatest stack depth: %lu bytes left\n",
                current->comm, unused);
        lowest_to_date = unused;
    }
    spin_unlock(&low_water_lock);
}
#else
/* Stack usage debugging is compiled out: nothing to check. */
static inline void check_stack_usage(void) {}
#endif
899
900NORET_TYPE void do_exit(long code)
901{
902    struct task_struct *tsk = current;
903    int group_dead;
904
905    profile_task_exit(tsk);
906
907    WARN_ON(atomic_read(&tsk->fs_excl));
908
909    if (unlikely(in_interrupt()))
910        panic("Aiee, killing interrupt handler!");
911    if (unlikely(!tsk->pid))
912        panic("Attempted to kill the idle task!");
913
914    tracehook_report_exit(&code);
915
916    validate_creds_for_do_exit(tsk);
917
918    /*
919     * We're taking recursive faults here in do_exit. Safest is to just
920     * leave this task alone and wait for reboot.
921     */
922    if (unlikely(tsk->flags & PF_EXITING)) {
923        printk(KERN_ALERT
924            "Fixing recursive fault but reboot is needed!\n");
925        /*
926         * We can do this unlocked here. The futex code uses
927         * this flag just to verify whether the pi state
928         * cleanup has been done or not. In the worst case it
929         * loops once more. We pretend that the cleanup was
930         * done as there is no way to return. Either the
931         * OWNER_DIED bit is set by now or we push the blocked
932         * task into the wait for ever nirwana as well.
933         */
934        tsk->flags |= PF_EXITPIDONE;
935        set_current_state(TASK_UNINTERRUPTIBLE);
936        schedule();
937    }
938
939    exit_irq_thread();
940
941    exit_signals(tsk); /* sets PF_EXITING */
942    /*
943     * tsk->flags are checked in the futex code to protect against
944     * an exiting task cleaning up the robust pi futexes.
945     */
946    smp_mb();
947    raw_spin_unlock_wait(&tsk->pi_lock);
948
949    if (unlikely(in_atomic()))
950        printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
951                current->comm, task_pid_nr(current),
952                preempt_count());
953
954    acct_update_integrals(tsk);
955    /* sync mm's RSS info before statistics gathering */
956    if (tsk->mm)
957        sync_mm_rss(tsk, tsk->mm);
958    group_dead = atomic_dec_and_test(&tsk->signal->live);
959    if (group_dead) {
960        hrtimer_cancel(&tsk->signal->real_timer);
961        exit_itimers(tsk->signal);
962        if (tsk->mm)
963            setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
964    }
965    acct_collect(code, group_dead);
966    if (group_dead)
967        tty_audit_exit();
968    if (unlikely(tsk->audit_context))
969        audit_free(tsk);
970
971    tsk->exit_code = code;
972    taskstats_exit(tsk, group_dead);
973
974    exit_mm(tsk);
975
976    if (group_dead)
977        acct_process();
978    trace_sched_process_exit(tsk);
979
980    exit_sem(tsk);
981    exit_files(tsk);
982    exit_fs(tsk);
983    check_stack_usage();
984    exit_thread();
985    cgroup_exit(tsk, 1);
986
987    if (group_dead)
988        disassociate_ctty(1);
989
990    module_put(task_thread_info(tsk)->exec_domain->module);
991
992    proc_exit_connector(tsk);
993
994    /*
995     * FIXME: do that only when needed, using sched_exit tracepoint
996     */
997    flush_ptrace_hw_breakpoint(tsk);
998    /*
999     * Flush inherited counters to the parent - before the parent
1000     * gets woken up by child-exit notifications.
1001     */
1002    perf_event_exit_task(tsk);
1003
1004    exit_notify(tsk, group_dead);
1005#ifdef CONFIG_NUMA
1006    mpol_put(tsk->mempolicy);
1007    tsk->mempolicy = NULL;
1008#endif
1009#ifdef CONFIG_FUTEX
1010    if (unlikely(current->pi_state_cache))
1011        kfree(current->pi_state_cache);
1012#endif
1013    /*
1014     * Make sure we are holding no locks:
1015     */
1016    debug_check_no_locks_held(tsk);
1017    /*
1018     * We can do this unlocked here. The futex code uses this flag
1019     * just to verify whether the pi state cleanup has been done
1020     * or not. In the worst case it loops once more.
1021     */
1022    tsk->flags |= PF_EXITPIDONE;
1023
1024    if (tsk->io_context)
1025        exit_io_context(tsk);
1026
1027    if (tsk->splice_pipe)
1028        __free_pipe_info(tsk->splice_pipe);
1029
1030    validate_creds_for_do_exit(tsk);
1031
1032    preempt_disable();
1033    exit_rcu();
1034    /* causes final put_task_struct in finish_task_switch(). */
1035    tsk->state = TASK_DEAD;
1036    schedule();
1037    BUG();
1038    /* Avoid "noreturn function does return". */
1039    for (;;)
1040        cpu_relax(); /* For when BUG is null */
1041}
1042
1043EXPORT_SYMBOL_GPL(do_exit);
1044
1045NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1046{
1047    if (comp)
1048        complete(comp);
1049
1050    do_exit(code);
1051}
1052
1053EXPORT_SYMBOL(complete_and_exit);
1054
1055SYSCALL_DEFINE1(exit, int, error_code)
1056{
1057    do_exit((error_code&0xff)<<8);
1058}
1059
1060/*
1061 * Take down every thread in the group. This is called by fatal signals
1062 * as well as by sys_exit_group (below).
1063 */
1064NORET_TYPE void
1065do_group_exit(int exit_code)
1066{
1067    struct signal_struct *sig = current->signal;
1068
1069    BUG_ON(exit_code & 0x80); /* core dumps don't get here */
1070
1071    if (signal_group_exit(sig))
1072        exit_code = sig->group_exit_code;
1073    else if (!thread_group_empty(current)) {
1074        struct sighand_struct *const sighand = current->sighand;
1075        spin_lock_irq(&sighand->siglock);
1076        if (signal_group_exit(sig))
1077            /* Another thread got here before we took the lock. */
1078            exit_code = sig->group_exit_code;
1079        else {
1080            sig->group_exit_code = exit_code;
1081            sig->flags = SIGNAL_GROUP_EXIT;
1082            zap_other_threads(current);
1083        }
1084        spin_unlock_irq(&sighand->siglock);
1085    }
1086
1087    do_exit(exit_code);
1088    /* NOTREACHED */
1089}
1090
1091/*
1092 * this kills every thread in the thread group. Note that any externally
1093 * wait4()-ing process will get the correct exit code - even if this
1094 * thread is not the thread group leader.
1095 */
1096SYSCALL_DEFINE1(exit_group, int, error_code)
1097{
1098    do_group_exit((error_code & 0xff) << 8);
1099    /* NOTREACHED */
1100    return 0;
1101}
1102
1103struct wait_opts {
1104    enum pid_type wo_type;
1105    int wo_flags;
1106    struct pid *wo_pid;
1107
1108    struct siginfo __user *wo_info;
1109    int __user *wo_stat;
1110    struct rusage __user *wo_rusage;
1111
1112    wait_queue_t child_wait;
1113    int notask_error;
1114};
1115
1116static inline
1117struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1118{
1119    if (type != PIDTYPE_PID)
1120        task = task->group_leader;
1121    return task->pids[type].pid;
1122}
1123
1124static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1125{
1126    return wo->wo_type == PIDTYPE_MAX ||
1127        task_pid_type(p, wo->wo_type) == wo->wo_pid;
1128}
1129
1130static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1131{
1132    if (!eligible_pid(wo, p))
1133        return 0;
1134    /* Wait for all children (clone and not) if __WALL is set;
1135     * otherwise, wait for clone children *only* if __WCLONE is
1136     * set; otherwise, wait for non-clone children *only*. (Note:
1137     * A "clone" child here is one that reports to its parent
1138     * using a signal other than SIGCHLD.) */
1139    if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1140        && !(wo->wo_flags & __WALL))
1141        return 0;
1142
1143    return 1;
1144}
1145
1146static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1147                pid_t pid, uid_t uid, int why, int status)
1148{
1149    struct siginfo __user *infop;
1150    int retval = wo->wo_rusage
1151        ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1152
1153    put_task_struct(p);
1154    infop = wo->wo_info;
1155    if (infop) {
1156        if (!retval)
1157            retval = put_user(SIGCHLD, &infop->si_signo);
1158        if (!retval)
1159            retval = put_user(0, &infop->si_errno);
1160        if (!retval)
1161            retval = put_user((short)why, &infop->si_code);
1162        if (!retval)
1163            retval = put_user(pid, &infop->si_pid);
1164        if (!retval)
1165            retval = put_user(uid, &infop->si_uid);
1166        if (!retval)
1167            retval = put_user(status, &infop->si_status);
1168    }
1169    if (!retval)
1170        retval = pid;
1171    return retval;
1172}
1173
1174/*
1175 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
1176 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1177 * the lock and this task is uninteresting. If we return nonzero, we have
1178 * released the lock and the system call should return.
1179 */
1180static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1181{
1182    unsigned long state;
1183    int retval, status, traced;
1184    pid_t pid = task_pid_vnr(p);
1185    uid_t uid = __task_cred(p)->uid;
1186    struct siginfo __user *infop;
1187
1188    if (!likely(wo->wo_flags & WEXITED))
1189        return 0;
1190
1191    if (unlikely(wo->wo_flags & WNOWAIT)) {
1192        int exit_code = p->exit_code;
1193        int why;
1194
1195        get_task_struct(p);
1196        read_unlock(&tasklist_lock);
1197        if ((exit_code & 0x7f) == 0) {
1198            why = CLD_EXITED;
1199            status = exit_code >> 8;
1200        } else {
1201            why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1202            status = exit_code & 0x7f;
1203        }
1204        return wait_noreap_copyout(wo, p, pid, uid, why, status);
1205    }
1206
1207    /*
1208     * Try to move the task's state to DEAD
1209     * only one thread is allowed to do this:
1210     */
1211    state = xchg(&p->exit_state, EXIT_DEAD);
1212    if (state != EXIT_ZOMBIE) {
1213        BUG_ON(state != EXIT_DEAD);
1214        return 0;
1215    }
1216
1217    traced = ptrace_reparented(p);
1218    /*
1219     * It can be ptraced but not reparented, check
1220     * !task_detached() to filter out sub-threads.
1221     */
1222    if (likely(!traced) && likely(!task_detached(p))) {
1223        struct signal_struct *psig;
1224        struct signal_struct *sig;
1225        unsigned long maxrss;
1226        cputime_t tgutime, tgstime;
1227
1228        /*
1229         * The resource counters for the group leader are in its
1230         * own task_struct. Those for dead threads in the group
1231         * are in its signal_struct, as are those for the child
1232         * processes it has previously reaped. All these
1233         * accumulate in the parent's signal_struct c* fields.
1234         *
1235         * We don't bother to take a lock here to protect these
1236         * p->signal fields, because they are only touched by
1237         * __exit_signal, which runs with tasklist_lock
1238         * write-locked anyway, and so is excluded here. We do
1239         * need to protect the access to parent->signal fields,
1240         * as other threads in the parent group can be right
1241         * here reaping other children at the same time.
1242         *
1243         * We use thread_group_times() to get times for the thread
1244         * group, which consolidates times for all threads in the
1245         * group including the group leader.
1246         */
1247        thread_group_times(p, &tgutime, &tgstime);
1248        spin_lock_irq(&p->real_parent->sighand->siglock);
1249        psig = p->real_parent->signal;
1250        sig = p->signal;
1251        psig->cutime =
1252            cputime_add(psig->cutime,
1253            cputime_add(tgutime,
1254                    sig->cutime));
1255        psig->cstime =
1256            cputime_add(psig->cstime,
1257            cputime_add(tgstime,
1258                    sig->cstime));
1259        psig->cgtime =
1260            cputime_add(psig->cgtime,
1261            cputime_add(p->gtime,
1262            cputime_add(sig->gtime,
1263                    sig->cgtime)));
1264        psig->cmin_flt +=
1265            p->min_flt + sig->min_flt + sig->cmin_flt;
1266        psig->cmaj_flt +=
1267            p->maj_flt + sig->maj_flt + sig->cmaj_flt;
1268        psig->cnvcsw +=
1269            p->nvcsw + sig->nvcsw + sig->cnvcsw;
1270        psig->cnivcsw +=
1271            p->nivcsw + sig->nivcsw + sig->cnivcsw;
1272        psig->cinblock +=
1273            task_io_get_inblock(p) +
1274            sig->inblock + sig->cinblock;
1275        psig->coublock +=
1276            task_io_get_oublock(p) +
1277            sig->oublock + sig->coublock;
1278        maxrss = max(sig->maxrss, sig->cmaxrss);
1279        if (psig->cmaxrss < maxrss)
1280            psig->cmaxrss = maxrss;
1281        task_io_accounting_add(&psig->ioac, &p->ioac);
1282        task_io_accounting_add(&psig->ioac, &sig->ioac);
1283        spin_unlock_irq(&p->real_parent->sighand->siglock);
1284    }
1285
1286    /*
1287     * Now we are sure this task is interesting, and no other
1288     * thread can reap it because we set its state to EXIT_DEAD.
1289     */
1290    read_unlock(&tasklist_lock);
1291
1292    retval = wo->wo_rusage
1293        ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1294    status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1295        ? p->signal->group_exit_code : p->exit_code;
1296    if (!retval && wo->wo_stat)
1297        retval = put_user(status, wo->wo_stat);
1298
1299    infop = wo->wo_info;
1300    if (!retval && infop)
1301        retval = put_user(SIGCHLD, &infop->si_signo);
1302    if (!retval && infop)
1303        retval = put_user(0, &infop->si_errno);
1304    if (!retval && infop) {
1305        int why;
1306
1307        if ((status & 0x7f) == 0) {
1308            why = CLD_EXITED;
1309            status >>= 8;
1310        } else {
1311            why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
1312            status &= 0x7f;
1313        }
1314        retval = put_user((short)why, &infop->si_code);
1315        if (!retval)
1316            retval = put_user(status, &infop->si_status);
1317    }
1318    if (!retval && infop)
1319        retval = put_user(pid, &infop->si_pid);
1320    if (!retval && infop)
1321        retval = put_user(uid, &infop->si_uid);
1322    if (!retval)
1323        retval = pid;
1324
1325    if (traced) {
1326        write_lock_irq(&tasklist_lock);
1327        /* We dropped tasklist, ptracer could die and untrace */
1328        ptrace_unlink(p);
1329        /*
1330         * If this is not a detached task, notify the parent.
1331         * If it's still not detached after that, don't release
1332         * it now.
1333         */
1334        if (!task_detached(p)) {
1335            do_notify_parent(p, p->exit_signal);
1336            if (!task_detached(p)) {
1337                p->exit_state = EXIT_ZOMBIE;
1338                p = NULL;
1339            }
1340        }
1341        write_unlock_irq(&tasklist_lock);
1342    }
1343    if (p != NULL)
1344        release_task(p);
1345
1346    return retval;
1347}
1348
1349static int *task_stopped_code(struct task_struct *p, bool ptrace)
1350{
1351    if (ptrace) {
1352        if (task_is_stopped_or_traced(p))
1353            return &p->exit_code;
1354    } else {
1355        if (p->signal->flags & SIGNAL_STOP_STOPPED)
1356            return &p->signal->group_exit_code;
1357    }
1358    return NULL;
1359}
1360
1361/*
1362 * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
1363 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1364 * the lock and this task is uninteresting. If we return nonzero, we have
1365 * released the lock and the system call should return.
1366 */
1367static int wait_task_stopped(struct wait_opts *wo,
1368                int ptrace, struct task_struct *p)
1369{
1370    struct siginfo __user *infop;
1371    int retval, exit_code, *p_code, why;
1372    uid_t uid = 0; /* unneeded, required by compiler */
1373    pid_t pid;
1374
1375    /*
1376     * Traditionally we see ptrace'd stopped tasks regardless of options.
1377     */
1378    if (!ptrace && !(wo->wo_flags & WUNTRACED))
1379        return 0;
1380
1381    exit_code = 0;
1382    spin_lock_irq(&p->sighand->siglock);
1383
1384    p_code = task_stopped_code(p, ptrace);
1385    if (unlikely(!p_code))
1386        goto unlock_sig;
1387
1388    exit_code = *p_code;
1389    if (!exit_code)
1390        goto unlock_sig;
1391
1392    if (!unlikely(wo->wo_flags & WNOWAIT))
1393        *p_code = 0;
1394
1395    /* don't need the RCU readlock here as we're holding a spinlock */
1396    uid = __task_cred(p)->uid;
1397unlock_sig:
1398    spin_unlock_irq(&p->sighand->siglock);
1399    if (!exit_code)
1400        return 0;
1401
1402    /*
1403     * Now we are pretty sure this task is interesting.
1404     * Make sure it doesn't get reaped out from under us while we
1405     * give up the lock and then examine it below. We don't want to
1406     * keep holding onto the tasklist_lock while we call getrusage and
1407     * possibly take page faults for user memory.
1408     */
1409    get_task_struct(p);
1410    pid = task_pid_vnr(p);
1411    why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1412    read_unlock(&tasklist_lock);
1413
1414    if (unlikely(wo->wo_flags & WNOWAIT))
1415        return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1416
1417    retval = wo->wo_rusage
1418        ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1419    if (!retval && wo->wo_stat)
1420        retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1421
1422    infop = wo->wo_info;
1423    if (!retval && infop)
1424        retval = put_user(SIGCHLD, &infop->si_signo);
1425    if (!retval && infop)
1426        retval = put_user(0, &infop->si_errno);
1427    if (!retval && infop)
1428        retval = put_user((short)why, &infop->si_code);
1429    if (!retval && infop)
1430        retval = put_user(exit_code, &infop->si_status);
1431    if (!retval && infop)
1432        retval = put_user(pid, &infop->si_pid);
1433    if (!retval && infop)
1434        retval = put_user(uid, &infop->si_uid);
1435    if (!retval)
1436        retval = pid;
1437    put_task_struct(p);
1438
1439    BUG_ON(!retval);
1440    return retval;
1441}
1442
1443/*
1444 * Handle do_wait work for one task in a live, non-stopped state.
1445 * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
1446 * the lock and this task is uninteresting. If we return nonzero, we have
1447 * released the lock and the system call should return.
1448 */
1449static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1450{
1451    int retval;
1452    pid_t pid;
1453    uid_t uid;
1454
1455    if (!unlikely(wo->wo_flags & WCONTINUED))
1456        return 0;
1457
1458    if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1459        return 0;
1460
1461    spin_lock_irq(&p->sighand->siglock);
1462    /* Re-check with the lock held. */
1463    if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
1464        spin_unlock_irq(&p->sighand->siglock);
1465        return 0;
1466    }
1467    if (!unlikely(wo->wo_flags & WNOWAIT))
1468        p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1469    uid = __task_cred(p)->uid;
1470    spin_unlock_irq(&p->sighand->siglock);
1471
1472    pid = task_pid_vnr(p);
1473    get_task_struct(p);
1474    read_unlock(&tasklist_lock);
1475
1476    if (!wo->wo_info) {
1477        retval = wo->wo_rusage
1478            ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1479        put_task_struct(p);
1480        if (!retval && wo->wo_stat)
1481            retval = put_user(0xffff, wo->wo_stat);
1482        if (!retval)
1483            retval = pid;
1484    } else {
1485        retval = wait_noreap_copyout(wo, p, pid, uid,
1486                         CLD_CONTINUED, SIGCONT);
1487        BUG_ON(retval == 0);
1488    }
1489
1490    return retval;
1491}
1492
1493/*
1494 * Consider @p for a wait by @parent.
1495 *
1496 * -ECHILD should be in ->notask_error before the first call.
1497 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1498 * Returns zero if the search for a child should continue;
1499 * then ->notask_error is 0 if @p is an eligible child,
1500 * or another error from security_task_wait(), or still -ECHILD.
1501 */
1502static int wait_consider_task(struct wait_opts *wo, int ptrace,
1503                struct task_struct *p)
1504{
1505    int ret = eligible_child(wo, p);
1506    if (!ret)
1507        return ret;
1508
1509    ret = security_task_wait(p);
1510    if (unlikely(ret < 0)) {
1511        /*
1512         * If we have not yet seen any eligible child,
1513         * then let this error code replace -ECHILD.
1514         * A permission error will give the user a clue
1515         * to look for security policy problems, rather
1516         * than for mysterious wait bugs.
1517         */
1518        if (wo->notask_error)
1519            wo->notask_error = ret;
1520        return 0;
1521    }
1522
1523    if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1524        /*
1525         * This child is hidden by ptrace.
1526         * We aren't allowed to see it now, but eventually we will.
1527         */
1528        wo->notask_error = 0;
1529        return 0;
1530    }
1531
1532    if (p->exit_state == EXIT_DEAD)
1533        return 0;
1534
1535    /*
1536     * We don't reap group leaders with subthreads.
1537     */
1538    if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1539        return wait_task_zombie(wo, p);
1540
1541    /*
1542     * It's stopped or running now, so it might
1543     * later continue, exit, or stop again.
1544     */
1545    wo->notask_error = 0;
1546
1547    if (task_stopped_code(p, ptrace))
1548        return wait_task_stopped(wo, ptrace, p);
1549
1550    return wait_task_continued(wo, p);
1551}
1552
1553/*
1554 * Do the work of do_wait() for one thread in the group, @tsk.
1555 *
1556 * -ECHILD should be in ->notask_error before the first call.
1557 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1558 * Returns zero if the search for a child should continue; then
1559 * ->notask_error is 0 if there were any eligible children,
1560 * or another error from security_task_wait(), or still -ECHILD.
1561 */
1562static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1563{
1564    struct task_struct *p;
1565
1566    list_for_each_entry(p, &tsk->children, sibling) {
1567        int ret = wait_consider_task(wo, 0, p);
1568        if (ret)
1569            return ret;
1570    }
1571
1572    return 0;
1573}
1574
1575static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1576{
1577    struct task_struct *p;
1578
1579    list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1580        int ret = wait_consider_task(wo, 1, p);
1581        if (ret)
1582            return ret;
1583    }
1584
1585    return 0;
1586}
1587
1588static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1589                int sync, void *key)
1590{
1591    struct wait_opts *wo = container_of(wait, struct wait_opts,
1592                        child_wait);
1593    struct task_struct *p = key;
1594
1595    if (!eligible_pid(wo, p))
1596        return 0;
1597
1598    if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1599        return 0;
1600
1601    return default_wake_function(wait, mode, sync, key);
1602}
1603
1604void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1605{
1606    __wake_up_sync_key(&parent->signal->wait_chldexit,
1607                TASK_INTERRUPTIBLE, 1, p);
1608}
1609
1610static long do_wait(struct wait_opts *wo)
1611{
1612    struct task_struct *tsk;
1613    int retval;
1614
1615    trace_sched_process_wait(wo->wo_pid);
1616
1617    init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1618    wo->child_wait.private = current;
1619    add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1620repeat:
1621    /*
1622     * If there is nothing that can match our critiera just get out.
1623     * We will clear ->notask_error to zero if we see any child that
1624     * might later match our criteria, even if we are not able to reap
1625     * it yet.
1626     */
1627    wo->notask_error = -ECHILD;
1628    if ((wo->wo_type < PIDTYPE_MAX) &&
1629       (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1630        goto notask;
1631
1632    set_current_state(TASK_INTERRUPTIBLE);
1633    read_lock(&tasklist_lock);
1634    tsk = current;
1635    do {
1636        retval = do_wait_thread(wo, tsk);
1637        if (retval)
1638            goto end;
1639
1640        retval = ptrace_do_wait(wo, tsk);
1641        if (retval)
1642            goto end;
1643
1644        if (wo->wo_flags & __WNOTHREAD)
1645            break;
1646    } while_each_thread(current, tsk);
1647    read_unlock(&tasklist_lock);
1648
1649notask:
1650    retval = wo->notask_error;
1651    if (!retval && !(wo->wo_flags & WNOHANG)) {
1652        retval = -ERESTARTSYS;
1653        if (!signal_pending(current)) {
1654            schedule();
1655            goto repeat;
1656        }
1657    }
1658end:
1659    __set_current_state(TASK_RUNNING);
1660    remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1661    return retval;
1662}
1663
1664SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1665        infop, int, options, struct rusage __user *, ru)
1666{
1667    struct wait_opts wo;
1668    struct pid *pid = NULL;
1669    enum pid_type type;
1670    long ret;
1671
1672    if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
1673        return -EINVAL;
1674    if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
1675        return -EINVAL;
1676
1677    switch (which) {
1678    case P_ALL:
1679        type = PIDTYPE_MAX;
1680        break;
1681    case P_PID:
1682        type = PIDTYPE_PID;
1683        if (upid <= 0)
1684            return -EINVAL;
1685        break;
1686    case P_PGID:
1687        type = PIDTYPE_PGID;
1688        if (upid <= 0)
1689            return -EINVAL;
1690        break;
1691    default:
1692        return -EINVAL;
1693    }
1694
1695    if (type < PIDTYPE_MAX)
1696        pid = find_get_pid(upid);
1697
1698    wo.wo_type = type;
1699    wo.wo_pid = pid;
1700    wo.wo_flags = options;
1701    wo.wo_info = infop;
1702    wo.wo_stat = NULL;
1703    wo.wo_rusage = ru;
1704    ret = do_wait(&wo);
1705
1706    if (ret > 0) {
1707        ret = 0;
1708    } else if (infop) {
1709        /*
1710         * For a WNOHANG return, clear out all the fields
1711         * we would set so the user can easily tell the
1712         * difference.
1713         */
1714        if (!ret)
1715            ret = put_user(0, &infop->si_signo);
1716        if (!ret)
1717            ret = put_user(0, &infop->si_errno);
1718        if (!ret)
1719            ret = put_user(0, &infop->si_code);
1720        if (!ret)
1721            ret = put_user(0, &infop->si_pid);
1722        if (!ret)
1723            ret = put_user(0, &infop->si_uid);
1724        if (!ret)
1725            ret = put_user(0, &infop->si_status);
1726    }
1727
1728    put_pid(pid);
1729
1730    /* avoid REGPARM breakage on x86: */
1731    asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1732    return ret;
1733}
1734
1735SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1736        int, options, struct rusage __user *, ru)
1737{
1738    struct wait_opts wo;
1739    struct pid *pid = NULL;
1740    enum pid_type type;
1741    long ret;
1742
1743    if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
1744            __WNOTHREAD|__WCLONE|__WALL))
1745        return -EINVAL;
1746
1747    if (upid == -1)
1748        type = PIDTYPE_MAX;
1749    else if (upid < 0) {
1750        type = PIDTYPE_PGID;
1751        pid = find_get_pid(-upid);
1752    } else if (upid == 0) {
1753        type = PIDTYPE_PGID;
1754        pid = get_task_pid(current, PIDTYPE_PGID);
1755    } else /* upid > 0 */ {
1756        type = PIDTYPE_PID;
1757        pid = find_get_pid(upid);
1758    }
1759
1760    wo.wo_type = type;
1761    wo.wo_pid = pid;
1762    wo.wo_flags = options | WEXITED;
1763    wo.wo_info = NULL;
1764    wo.wo_stat = stat_addr;
1765    wo.wo_rusage = ru;
1766    ret = do_wait(&wo);
1767    put_pid(pid);
1768
1769    /* avoid REGPARM breakage on x86: */
1770    asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1771    return ret;
1772}
1773
#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() is kept only for compatibility; libc should implement
 * waitpid() on top of sys_wait4().  It forwards to sys_wait4() with no
 * rusage buffer.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
    struct rusage __user *no_rusage = NULL;

    return sys_wait4(pid, stat_addr, options, no_rusage);
}

#endif
1786

Archive Download this file



interactive