Root/kernel/pid_namespace.c

1/*
2 * Pid namespaces
3 *
4 * Authors:
5 * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
6 * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
7 * Many thanks to Oleg Nesterov for comments and help
8 *
9 */
10
11#include <linux/pid.h>
12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h>
14#include <linux/err.h>
15#include <linux/acct.h>
16#include <linux/slab.h>
17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
19
/* Number of bits in one page; used below as the free-slot count of a pidmap page. */
#define BITS_PER_PAGE (PAGE_SIZE * 8)
21
22struct pid_cache {
23    int nr_ids;
24    char name[16];
25    struct kmem_cache *cachep;
26    struct list_head list;
27};
28
/* All pid_cache entries created so far; walked and extended under pid_caches_mutex. */
static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);

/* Cache that struct pid_namespace objects come from; created in pid_namespaces_init(). */
static struct kmem_cache *pid_ns_cachep;
32
33/*
34 * creates the kmem cache to allocate pids from.
35 * @nr_ids: the number of numerical ids this pid will have to carry
36 */
37
38static struct kmem_cache *create_pid_cachep(int nr_ids)
39{
40    struct pid_cache *pcache;
41    struct kmem_cache *cachep;
42
43    mutex_lock(&pid_caches_mutex);
44    list_for_each_entry(pcache, &pid_caches_lh, list)
45        if (pcache->nr_ids == nr_ids)
46            goto out;
47
48    pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
49    if (pcache == NULL)
50        goto err_alloc;
51
52    snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
53    cachep = kmem_cache_create(pcache->name,
54            sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
55            0, SLAB_HWCACHE_ALIGN, NULL);
56    if (cachep == NULL)
57        goto err_cachep;
58
59    pcache->nr_ids = nr_ids;
60    pcache->cachep = cachep;
61    list_add(&pcache->list, &pid_caches_lh);
62out:
63    mutex_unlock(&pid_caches_mutex);
64    return pcache->cachep;
65
66err_cachep:
67    kfree(pcache);
68err_alloc:
69    mutex_unlock(&pid_caches_mutex);
70    return NULL;
71}
72
73static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
74{
75    struct pid_namespace *ns;
76    unsigned int level = parent_pid_ns->level + 1;
77    int i, err = -ENOMEM;
78
79    ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
80    if (ns == NULL)
81        goto out;
82
83    ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
84    if (!ns->pidmap[0].page)
85        goto out_free;
86
87    ns->pid_cachep = create_pid_cachep(level + 1);
88    if (ns->pid_cachep == NULL)
89        goto out_free_map;
90
91    kref_init(&ns->kref);
92    ns->level = level;
93    ns->parent = get_pid_ns(parent_pid_ns);
94
95    set_bit(0, ns->pidmap[0].page);
96    atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
97
98    for (i = 1; i < PIDMAP_ENTRIES; i++)
99        atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
100
101    err = pid_ns_prepare_proc(ns);
102    if (err)
103        goto out_put_parent_pid_ns;
104
105    return ns;
106
107out_put_parent_pid_ns:
108    put_pid_ns(parent_pid_ns);
109out_free_map:
110    kfree(ns->pidmap[0].page);
111out_free:
112    kmem_cache_free(pid_ns_cachep, ns);
113out:
114    return ERR_PTR(err);
115}
116
117static void destroy_pid_namespace(struct pid_namespace *ns)
118{
119    int i;
120
121    for (i = 0; i < PIDMAP_ENTRIES; i++)
122        kfree(ns->pidmap[i].page);
123    kmem_cache_free(pid_ns_cachep, ns);
124}
125
126struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
127{
128    if (!(flags & CLONE_NEWPID))
129        return get_pid_ns(old_ns);
130    if (flags & (CLONE_THREAD|CLONE_PARENT))
131        return ERR_PTR(-EINVAL);
132    return create_pid_namespace(old_ns);
133}
134
135void free_pid_ns(struct kref *kref)
136{
137    struct pid_namespace *ns, *parent;
138
139    ns = container_of(kref, struct pid_namespace, kref);
140
141    parent = ns->parent;
142    destroy_pid_namespace(ns);
143
144    if (parent != NULL)
145        put_pid_ns(parent);
146}
147
148void zap_pid_ns_processes(struct pid_namespace *pid_ns)
149{
150    int nr;
151    int rc;
152    struct task_struct *task, *me = current;
153
154    /* Ignore SIGCHLD causing any terminated children to autoreap */
155    spin_lock_irq(&me->sighand->siglock);
156    me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
157    spin_unlock_irq(&me->sighand->siglock);
158
159    /*
160     * The last thread in the cgroup-init thread group is terminating.
161     * Find remaining pid_ts in the namespace, signal and wait for them
162     * to exit.
163     *
164     * Note: This signals each threads in the namespace - even those that
165     * belong to the same thread group, To avoid this, we would have
166     * to walk the entire tasklist looking a processes in this
167     * namespace, but that could be unnecessarily expensive if the
168     * pid namespace has just a few processes. Or we need to
169     * maintain a tasklist for each pid namespace.
170     *
171     */
172    read_lock(&tasklist_lock);
173    nr = next_pidmap(pid_ns, 1);
174    while (nr > 0) {
175        rcu_read_lock();
176
177        task = pid_task(find_vpid(nr), PIDTYPE_PID);
178        if (task && !__fatal_signal_pending(task))
179            send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
180
181        rcu_read_unlock();
182
183        nr = next_pidmap(pid_ns, nr);
184    }
185    read_unlock(&tasklist_lock);
186
187    /* Firstly reap the EXIT_ZOMBIE children we may have. */
188    do {
189        clear_thread_flag(TIF_SIGPENDING);
190        rc = sys_wait4(-1, NULL, __WALL, NULL);
191    } while (rc != -ECHILD);
192
193    /*
194     * sys_wait4() above can't reap the TASK_DEAD children.
195     * Make sure they all go away, see __unhash_process().
196     */
197    for (;;) {
198        bool need_wait = false;
199
200        read_lock(&tasklist_lock);
201        if (!list_empty(&current->children)) {
202            __set_current_state(TASK_UNINTERRUPTIBLE);
203            need_wait = true;
204        }
205        read_unlock(&tasklist_lock);
206
207        if (!need_wait)
208            break;
209        schedule();
210    }
211
212    if (pid_ns->reboot)
213        current->signal->group_exit_code = pid_ns->reboot;
214
215    acct_exit_ns(pid_ns);
216    return;
217}
218
219#ifdef CONFIG_CHECKPOINT_RESTORE
220static int pid_ns_ctl_handler(struct ctl_table *table, int write,
221        void __user *buffer, size_t *lenp, loff_t *ppos)
222{
223    struct ctl_table tmp = *table;
224
225    if (write && !capable(CAP_SYS_ADMIN))
226        return -EPERM;
227
228    /*
229     * Writing directly to ns' last_pid field is OK, since this field
230     * is volatile in a living namespace anyway and a code writing to
231     * it should synchronize its usage with external means.
232     */
233
234    tmp.data = &current->nsproxy->pid_ns->last_pid;
235    return proc_dointvec(&tmp, write, buffer, lenp, ppos);
236}
237
238static struct ctl_table pid_ns_ctl_table[] = {
239    {
240        .procname = "ns_last_pid",
241        .maxlen = sizeof(int),
242        .mode = 0666, /* permissions are checked in the handler */
243        .proc_handler = pid_ns_ctl_handler,
244    },
245    { }
246};
247static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
248#endif /* CONFIG_CHECKPOINT_RESTORE */
249
250int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
251{
252    if (pid_ns == &init_pid_ns)
253        return 0;
254
255    switch (cmd) {
256    case LINUX_REBOOT_CMD_RESTART2:
257    case LINUX_REBOOT_CMD_RESTART:
258        pid_ns->reboot = SIGHUP;
259        break;
260
261    case LINUX_REBOOT_CMD_POWER_OFF:
262    case LINUX_REBOOT_CMD_HALT:
263        pid_ns->reboot = SIGINT;
264        break;
265    default:
266        return -EINVAL;
267    }
268
269    read_lock(&tasklist_lock);
270    force_sig(SIGKILL, pid_ns->child_reaper);
271    read_unlock(&tasklist_lock);
272
273    do_exit(0);
274
275    /* Not reached */
276    return 0;
277}
278
279static __init int pid_namespaces_init(void)
280{
281    pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
282
283#ifdef CONFIG_CHECKPOINT_RESTORE
284    register_sysctl_paths(kern_path, pid_ns_ctl_table);
285#endif
286    return 0;
287}
288
289__initcall(pid_namespaces_init);
290

Archive Download this file



interactive