Root/kernel/taskstats.c

1/*
2 * taskstats.c - Export per-task statistics to userland
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h>
21#include <linux/tsacct_kern.h>
22#include <linux/delayacct.h>
23#include <linux/cpumask.h>
24#include <linux/percpu.h>
25#include <linux/slab.h>
26#include <linux/cgroupstats.h>
27#include <linux/cgroup.h>
28#include <linux/fs.h>
29#include <linux/file.h>
30#include <net/genetlink.h>
31#include <asm/atomic.h>
32
33/*
34 * Maximum length of a cpumask that can be specified in
35 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
36 */
37#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)    /* parse() rejects longer cpumask attributes with -E2BIG */
38
39static DEFINE_PER_CPU(__u32, taskstats_seqnum);    /* per-CPU counter; post-incremented by prepare_reply() when it has no request context */
40static int family_registered;    /* set to 1 by taskstats_init() after family and ops registration; taskstats_exit() returns early while it is 0 */
41struct kmem_cache *taskstats_cache;    /* created in taskstats_init_early(); taskstats_tgid_alloc() allocates from it */
42
43static struct genl_family family = {    /* generic netlink family used for every message built in this file */
44    .id = GENL_ID_GENERATE,
45    .name = TASKSTATS_GENL_NAME,
46    .version = TASKSTATS_GENL_VERSION,
47    .maxattr = TASKSTATS_CMD_ATTR_MAX,
48};
49
50static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {    /* attribute policy attached to taskstats_ops */
51    [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
52    [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
53    [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },    /* cpu list string, decoded by parse() */
54    [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
55
56static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {    /* attribute policy attached to cgroupstats_ops */
57    [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },    /* file descriptor resolved with fget_light() in cgroupstats_user_cmd() */
58};
59
60struct listener {    /* one registered exit-data listener, linked on a per-CPU list */
61    struct list_head list;    /* link in listener_list.list */
62    pid_t pid;    /* destination id handed to genlmsg_unicast() in send_cpu_listeners() */
63    char valid;    /* 1 after registration; cleared when a send returns -ECONNREFUSED, then the entry is freed */
64};
65
66struct listener_list {
67    struct rw_semaphore sem;    /* held for read while sending, for write while adding or removing entries */
68    struct list_head list;    /* list of struct listener */
69};
70static DEFINE_PER_CPU(struct listener_list, listener_array);    /* one listener list per CPU; initialised in taskstats_init_early() */
71
72enum actions {    /* values passed as the isadd argument of add_del_listener() */
73    REGISTER,
74    DEREGISTER,
75    CPU_DONT_CARE    /* not referenced by the code visible in this file */
76};
77
78static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
79                size_t size)
80{
81    struct sk_buff *skb;
82    void *reply;
83
84    /*
85     * If new attributes are added, please revisit this allocation
86     */
87    skb = genlmsg_new(size, GFP_KERNEL);
88    if (!skb)
89        return -ENOMEM;
90
91    if (!info) {
92        int seq = get_cpu_var(taskstats_seqnum)++;
93        put_cpu_var(taskstats_seqnum);
94
95        reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
96    } else
97        reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
98    if (reply == NULL) {
99        nlmsg_free(skb);
100        return -EINVAL;
101    }
102
103    *skbp = skb;
104    return 0;
105}
106
/*
 * Finalize the message in @skb and send it as the reply to the request
 * described by @info.
 *
 * Returns the result of genlmsg_reply().  If genlmsg_end() fails, @skb is
 * freed and the negative genlmsg_end() result is returned instead.
 */
static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
    struct genlmsghdr *hdr = nlmsg_data(nlmsg_hdr(skb));
    int err = genlmsg_end(skb, genlmsg_data(hdr));

    if (err >= 0)
        return genlmsg_reply(skb, info);

    nlmsg_free(skb);
    return err;
}
124
125/*
126 * Send taskstats data in @skb to listeners registered for @cpu's exit data
127 */
128static void send_cpu_listeners(struct sk_buff *skb,
129                    struct listener_list *listeners)
130{
131    struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
132    struct listener *s, *tmp;
133    struct sk_buff *skb_next, *skb_cur = skb;
134    void *reply = genlmsg_data(genlhdr);
135    int rc, delcount = 0;
136
137    rc = genlmsg_end(skb, reply);
138    if (rc < 0) {
139        nlmsg_free(skb);
140        return;
141    }
142
143    rc = 0;
144    down_read(&listeners->sem);
145    list_for_each_entry(s, &listeners->list, list) {
146        skb_next = NULL;
147        if (!list_is_last(&s->list, &listeners->list)) {
148            skb_next = skb_clone(skb_cur, GFP_KERNEL);
149            if (!skb_next)
150                break;
151        }
152        rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
153        if (rc == -ECONNREFUSED) {
154            s->valid = 0;
155            delcount++;
156        }
157        skb_cur = skb_next;
158    }
159    up_read(&listeners->sem);
160
161    if (skb_cur)
162        nlmsg_free(skb_cur);
163
164    if (!delcount)
165        return;
166
167    /* Delete invalidated entries */
168    down_write(&listeners->sem);
169    list_for_each_entry_safe(s, tmp, &listeners->list, list) {
170        if (!s->valid) {
171            list_del(&s->list);
172            kfree(s);
173        }
174    }
175    up_write(&listeners->sem);
176}
177
178static int fill_pid(pid_t pid, struct task_struct *tsk,
179        struct taskstats *stats)
180{
181    int rc = 0;
182
183    if (!tsk) {
184        rcu_read_lock();
185        tsk = find_task_by_vpid(pid);
186        if (tsk)
187            get_task_struct(tsk);
188        rcu_read_unlock();
189        if (!tsk)
190            return -ESRCH;
191    } else
192        get_task_struct(tsk);
193
194    memset(stats, 0, sizeof(*stats));
195    /*
196     * Each accounting subsystem adds calls to its functions to
197     * fill in relevant parts of struct taskstsats as follows
198     *
199     * per-task-foo(stats, tsk);
200     */
201
202    delayacct_add_tsk(stats, tsk);
203
204    /* fill in basic acct fields */
205    stats->version = TASKSTATS_VERSION;
206    stats->nvcsw = tsk->nvcsw;
207    stats->nivcsw = tsk->nivcsw;
208    bacct_add_tsk(stats, tsk);
209
210    /* fill in extended acct fields */
211    xacct_add_tsk(stats, tsk);
212
213    /* Define err: label here if needed */
214    put_task_struct(tsk);
215    return rc;
216
217}
218
219static int fill_tgid(pid_t tgid, struct task_struct *first,
220        struct taskstats *stats)
221{
222    struct task_struct *tsk;
223    unsigned long flags;
224    int rc = -ESRCH;
225
226    /*
227     * Add additional stats from live tasks except zombie thread group
228     * leaders who are already counted with the dead tasks
229     */
230    rcu_read_lock();
231    if (!first)
232        first = find_task_by_vpid(tgid);
233
234    if (!first || !lock_task_sighand(first, &flags))
235        goto out;
236
237    if (first->signal->stats)
238        memcpy(stats, first->signal->stats, sizeof(*stats));
239    else
240        memset(stats, 0, sizeof(*stats));
241
242    tsk = first;
243    do {
244        if (tsk->exit_state)
245            continue;
246        /*
247         * Accounting subsystem can call its functions here to
248         * fill in relevant parts of struct taskstsats as follows
249         *
250         * per-task-foo(stats, tsk);
251         */
252        delayacct_add_tsk(stats, tsk);
253
254        stats->nvcsw += tsk->nvcsw;
255        stats->nivcsw += tsk->nivcsw;
256    } while_each_thread(first, tsk);
257
258    unlock_task_sighand(first, &flags);
259    rc = 0;
260out:
261    rcu_read_unlock();
262
263    stats->version = TASKSTATS_VERSION;
264    /*
265     * Accounting subsystems can also add calls here to modify
266     * fields of taskstats.
267     */
268    return rc;
269}
270
271
272static void fill_tgid_exit(struct task_struct *tsk)
273{
274    unsigned long flags;
275
276    spin_lock_irqsave(&tsk->sighand->siglock, flags);
277    if (!tsk->signal->stats)
278        goto ret;
279
280    /*
281     * Each accounting subsystem calls its functions here to
282     * accumalate its per-task stats for tsk, into the per-tgid structure
283     *
284     * per-task-foo(tsk->signal->stats, tsk);
285     */
286    delayacct_add_tsk(tsk->signal->stats, tsk);
287ret:
288    spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
289    return;
290}
291
292static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
293{
294    struct listener_list *listeners;
295    struct listener *s, *tmp;
296    unsigned int cpu;
297
298    if (!cpumask_subset(mask, cpu_possible_mask))
299        return -EINVAL;
300
301    if (isadd == REGISTER) {
302        for_each_cpu(cpu, mask) {
303            s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
304                     cpu_to_node(cpu));
305            if (!s)
306                goto cleanup;
307            s->pid = pid;
308            INIT_LIST_HEAD(&s->list);
309            s->valid = 1;
310
311            listeners = &per_cpu(listener_array, cpu);
312            down_write(&listeners->sem);
313            list_add(&s->list, &listeners->list);
314            up_write(&listeners->sem);
315        }
316        return 0;
317    }
318
319    /* Deregister or cleanup */
320cleanup:
321    for_each_cpu(cpu, mask) {
322        listeners = &per_cpu(listener_array, cpu);
323        down_write(&listeners->sem);
324        list_for_each_entry_safe(s, tmp, &listeners->list, list) {
325            if (s->pid == pid) {
326                list_del(&s->list);
327                kfree(s);
328                break;
329            }
330        }
331        up_write(&listeners->sem);
332    }
333    return 0;
334}
335
336static int parse(struct nlattr *na, struct cpumask *mask)
337{
338    char *data;
339    int len;
340    int ret;
341
342    if (na == NULL)
343        return 1;
344    len = nla_len(na);
345    if (len > TASKSTATS_CPUMASK_MAXLEN)
346        return -E2BIG;
347    if (len < 1)
348        return -EINVAL;
349    data = kmalloc(len, GFP_KERNEL);
350    if (!data)
351        return -ENOMEM;
352    nla_strlcpy(data, na, len);
353    ret = cpulist_parse(data, mask);
354    kfree(data);
355    return ret;
356}
357
358static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
359{
360    struct nlattr *na, *ret;
361    int aggr;
362
363    aggr = (type == TASKSTATS_TYPE_PID)
364            ? TASKSTATS_TYPE_AGGR_PID
365            : TASKSTATS_TYPE_AGGR_TGID;
366
367    na = nla_nest_start(skb, aggr);
368    if (!na)
369        goto err;
370    if (nla_put(skb, type, sizeof(pid), &pid) < 0)
371        goto err;
372    ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
373    if (!ret)
374        goto err;
375    nla_nest_end(skb, na);
376
377    return nla_data(ret);
378err:
379    return NULL;
380}
381
382static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
383{
384    int rc = 0;
385    struct sk_buff *rep_skb;
386    struct cgroupstats *stats;
387    struct nlattr *na;
388    size_t size;
389    u32 fd;
390    struct file *file;
391    int fput_needed;
392
393    na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
394    if (!na)
395        return -EINVAL;
396
397    fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
398    file = fget_light(fd, &fput_needed);
399    if (!file)
400        return 0;
401
402    size = nla_total_size(sizeof(struct cgroupstats));
403
404    rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
405                size);
406    if (rc < 0)
407        goto err;
408
409    na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
410                sizeof(struct cgroupstats));
411    stats = nla_data(na);
412    memset(stats, 0, sizeof(*stats));
413
414    rc = cgroupstats_build(stats, file->f_dentry);
415    if (rc < 0) {
416        nlmsg_free(rep_skb);
417        goto err;
418    }
419
420    rc = send_reply(rep_skb, info);
421
422err:
423    fput_light(file, fput_needed);
424    return rc;
425}
426
427static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
428{
429    int rc;
430    struct sk_buff *rep_skb;
431    struct taskstats *stats;
432    size_t size;
433    cpumask_var_t mask;
434
435    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
436        return -ENOMEM;
437
438    rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
439    if (rc < 0)
440        goto free_return_rc;
441    if (rc == 0) {
442        rc = add_del_listener(info->snd_pid, mask, REGISTER);
443        goto free_return_rc;
444    }
445
446    rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
447    if (rc < 0)
448        goto free_return_rc;
449    if (rc == 0) {
450        rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
451free_return_rc:
452        free_cpumask_var(mask);
453        return rc;
454    }
455    free_cpumask_var(mask);
456
457    /*
458     * Size includes space for nested attributes
459     */
460    size = nla_total_size(sizeof(u32)) +
461        nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
462
463    rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
464    if (rc < 0)
465        return rc;
466
467    rc = -EINVAL;
468    if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
469        u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
470        stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
471        if (!stats)
472            goto err;
473
474        rc = fill_pid(pid, NULL, stats);
475        if (rc < 0)
476            goto err;
477    } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
478        u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
479        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
480        if (!stats)
481            goto err;
482
483        rc = fill_tgid(tgid, NULL, stats);
484        if (rc < 0)
485            goto err;
486    } else
487        goto err;
488
489    return send_reply(rep_skb, info);
490err:
491    nlmsg_free(rep_skb);
492    return rc;
493}
494
495static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
496{
497    struct signal_struct *sig = tsk->signal;
498    struct taskstats *stats;
499
500    if (sig->stats || thread_group_empty(tsk))
501        goto ret;
502
503    /* No problem if kmem_cache_zalloc() fails */
504    stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
505
506    spin_lock_irq(&tsk->sighand->siglock);
507    if (!sig->stats) {
508        sig->stats = stats;
509        stats = NULL;
510    }
511    spin_unlock_irq(&tsk->sighand->siglock);
512
513    if (stats)
514        kmem_cache_free(taskstats_cache, stats);
515ret:
516    return sig->stats;
517}
518
519/* Send pid data out on exit */
520void taskstats_exit(struct task_struct *tsk, int group_dead)
521{
522    int rc;
523    struct listener_list *listeners;
524    struct taskstats *stats;
525    struct sk_buff *rep_skb;
526    size_t size;
527    int is_thread_group;
528
529    if (!family_registered)
530        return;
531
532    /*
533     * Size includes space for nested attributes
534     */
535    size = nla_total_size(sizeof(u32)) +
536        nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
537
538    is_thread_group = !!taskstats_tgid_alloc(tsk);
539    if (is_thread_group) {
540        /* PID + STATS + TGID + STATS */
541        size = 2 * size;
542        /* fill the tsk->signal->stats structure */
543        fill_tgid_exit(tsk);
544    }
545
546    listeners = &__raw_get_cpu_var(listener_array);
547    if (list_empty(&listeners->list))
548        return;
549
550    rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
551    if (rc < 0)
552        return;
553
554    stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
555    if (!stats)
556        goto err;
557
558    rc = fill_pid(-1, tsk, stats);
559    if (rc < 0)
560        goto err;
561
562    /*
563     * Doesn't matter if tsk is the leader or the last group member leaving
564     */
565    if (!is_thread_group || !group_dead)
566        goto send;
567
568    stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
569    if (!stats)
570        goto err;
571
572    memcpy(stats, tsk->signal->stats, sizeof(*stats));
573
574send:
575    send_cpu_listeners(rep_skb, listeners);
576    return;
577err:
578    nlmsg_free(rep_skb);
579}
580
581static struct genl_ops taskstats_ops = {    /* TASKSTATS_CMD_GET operation; registered in taskstats_init() */
582    .cmd = TASKSTATS_CMD_GET,
583    .doit = taskstats_user_cmd,
584    .policy = taskstats_cmd_get_policy,
585};
586
587static struct genl_ops cgroupstats_ops = {    /* CGROUPSTATS_CMD_GET operation; registered in taskstats_init() */
588    .cmd = CGROUPSTATS_CMD_GET,
589    .doit = cgroupstats_user_cmd,
590    .policy = cgroupstats_cmd_get_policy,
591};
592
593/* Needed early in initialization */
594void __init taskstats_init_early(void)
595{
596    unsigned int i;
597
598    taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
599    for_each_possible_cpu(i) {
600        INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
601        init_rwsem(&(per_cpu(listener_array, i).sem));
602    }
603}
604
605static int __init taskstats_init(void)
606{
607    int rc;
608
609    rc = genl_register_family(&family);
610    if (rc)
611        return rc;
612
613    rc = genl_register_ops(&family, &taskstats_ops);
614    if (rc < 0)
615        goto err;
616
617    rc = genl_register_ops(&family, &cgroupstats_ops);
618    if (rc < 0)
619        goto err_cgroup_ops;
620
621    family_registered = 1;
622    printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
623    return 0;
624err_cgroup_ops:
625    genl_unregister_ops(&family, &taskstats_ops);
626err:
627    genl_unregister_family(&family);
628    return rc;
629}
630
631/*
632 * late initcall ensures initialization of statistics collection
633 * mechanisms precedes initialization of the taskstats interface
634 */
635late_initcall(taskstats_init);
636

Archive Download this file



interactive