kernel/taskstats.c

/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 * (C) Balbir Singh, IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
#include <linux/tsacct_kern.h>
#include <linux/delayacct.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/cgroupstats.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <net/genetlink.h>
#include <asm/atomic.h>

/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
 */
#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)

static DEFINE_PER_CPU(__u32, taskstats_seqnum);
static int family_registered;
struct kmem_cache *taskstats_cache;

static struct genl_family family = {
    .id = GENL_ID_GENERATE,
    .name = TASKSTATS_GENL_NAME,
    .version = TASKSTATS_GENL_VERSION,
    .maxattr = TASKSTATS_CMD_ATTR_MAX,
};

static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
    [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
    [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
    [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
    [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

static struct nla_policy
cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
    [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};

struct listener {
    struct list_head list;
    pid_t pid;
    char valid;
};

struct listener_list {
    struct rw_semaphore sem;
    struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

enum actions {
    REGISTER,
    DEREGISTER,
    CPU_DONT_CARE
};

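/*
 * Allocate a reply skb of @size and start a genetlink message for @cmd.
 * With a NULL @info the message is built as an unsolicited notification
 * using a per-cpu sequence number; otherwise it is a reply to the
 * request described by @info.
 */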
static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
                size_t size)
{
    struct sk_buff *skb;
    void *reply;

    /*
     * If new attributes are added, please revisit this allocation
     */
    skb = genlmsg_new(size, GFP_KERNEL);
    if (!skb)
        return -ENOMEM;

    if (!info) {
        int seq = get_cpu_var(taskstats_seqnum)++;
        put_cpu_var(taskstats_seqnum);

        reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
    } else
        reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
    if (reply == NULL) {
        nlmsg_free(skb);
        return -EINVAL;
    }

    *skbp = skb;
    return 0;
}

/*
 * Send taskstats data in @skb as a reply to the requester described by @info
 */
static int send_reply(struct sk_buff *skb, struct genl_info *info)
{
    struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
    void *reply = genlmsg_data(genlhdr);
    int rc;

    rc = genlmsg_end(skb, reply);
    if (rc < 0) {
        nlmsg_free(skb);
        return rc;
    }

    return genlmsg_reply(skb, info);
}

/*
 * Send taskstats data in @skb to the listeners registered for this CPU's
 * exit data
 */
static void send_cpu_listeners(struct sk_buff *skb,
                    struct listener_list *listeners)
{
    struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
    struct listener *s, *tmp;
    struct sk_buff *skb_next, *skb_cur = skb;
    void *reply = genlmsg_data(genlhdr);
    int rc, delcount = 0;

    rc = genlmsg_end(skb, reply);
    if (rc < 0) {
        nlmsg_free(skb);
        return;
    }

    rc = 0;
    down_read(&listeners->sem);
    list_for_each_entry(s, &listeners->list, list) {
        skb_next = NULL;
        if (!list_is_last(&s->list, &listeners->list)) {
            skb_next = skb_clone(skb_cur, GFP_KERNEL);
            if (!skb_next)
                break;
        }
        rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
        if (rc == -ECONNREFUSED) {
            s->valid = 0;
            delcount++;
        }
        skb_cur = skb_next;
    }
    up_read(&listeners->sem);

    if (skb_cur)
        nlmsg_free(skb_cur);

    if (!delcount)
        return;

    /* Delete invalidated entries */
    down_write(&listeners->sem);
    list_for_each_entry_safe(s, tmp, &listeners->list, list) {
        if (!s->valid) {
            list_del(&s->list);
            kfree(s);
        }
    }
    up_write(&listeners->sem);
}

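/*
 * Fill @stats with the accounting data for the task identified by @pid,
 * or for @tsk directly when a task_struct is supplied by the caller.
 */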
static int fill_pid(pid_t pid, struct task_struct *tsk,
        struct taskstats *stats)
{
    int rc = 0;

    if (!tsk) {
        rcu_read_lock();
        tsk = find_task_by_vpid(pid);
        if (tsk)
            get_task_struct(tsk);
        rcu_read_unlock();
        if (!tsk)
            return -ESRCH;
    } else
        get_task_struct(tsk);

    memset(stats, 0, sizeof(*stats));
    /*
     * Each accounting subsystem adds calls to its functions to
     * fill in relevant parts of struct taskstats as follows
     *
     * per-task-foo(stats, tsk);
     */

    delayacct_add_tsk(stats, tsk);

    /* fill in basic acct fields */
    stats->version = TASKSTATS_VERSION;
    stats->nvcsw = tsk->nvcsw;
    stats->nivcsw = tsk->nivcsw;
    bacct_add_tsk(stats, tsk);

    /* fill in extended acct fields */
    xacct_add_tsk(stats, tsk);

    /* Define err: label here if needed */
    put_task_struct(tsk);
    return rc;

}

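/*
 * Aggregate statistics for all live threads of the thread group led by
 * @tgid (or by @first, if supplied) on top of the already-accumulated
 * stats of its dead threads kept in signal->stats.
 */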
static int fill_tgid(pid_t tgid, struct task_struct *first,
        struct taskstats *stats)
{
    struct task_struct *tsk;
    unsigned long flags;
    int rc = -ESRCH;

    /*
     * Add additional stats from live tasks except zombie thread group
     * leaders who are already counted with the dead tasks
     */
    rcu_read_lock();
    if (!first)
        first = find_task_by_vpid(tgid);

    if (!first || !lock_task_sighand(first, &flags))
        goto out;

    if (first->signal->stats)
        memcpy(stats, first->signal->stats, sizeof(*stats));
    else
        memset(stats, 0, sizeof(*stats));

    tsk = first;
    do {
        if (tsk->exit_state)
            continue;
        /*
         * Accounting subsystem can call its functions here to
         * fill in relevant parts of struct taskstats as follows
         *
         * per-task-foo(stats, tsk);
         */
        delayacct_add_tsk(stats, tsk);

        stats->nvcsw += tsk->nvcsw;
        stats->nivcsw += tsk->nivcsw;
    } while_each_thread(first, tsk);

    unlock_task_sighand(first, &flags);
    rc = 0;
out:
    rcu_read_unlock();

    stats->version = TASKSTATS_VERSION;
    /*
     * Accounting subsystems can also add calls here to modify
     * fields of taskstats.
     */
    return rc;
}


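/*
 * Fold the per-task stats of an exiting @tsk into the per-tgid structure
 * at tsk->signal->stats, under siglock.
 */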
static void fill_tgid_exit(struct task_struct *tsk)
{
    unsigned long flags;

    spin_lock_irqsave(&tsk->sighand->siglock, flags);
    if (!tsk->signal->stats)
        goto ret;

    /*
     * Each accounting subsystem calls its functions here to
     * accumulate its per-task stats for tsk, into the per-tgid structure
     *
     * per-task-foo(tsk->signal->stats, tsk);
     */
    delayacct_add_tsk(tsk->signal->stats, tsk);
ret:
    spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
    return;
}

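/*
 * Register or deregister @pid as an exit-data listener on every CPU in
 * @mask, depending on whether @isadd is REGISTER or DEREGISTER.
 */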
static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
    struct listener_list *listeners;
    struct listener *s, *tmp;
    unsigned int cpu;

    if (!cpumask_subset(mask, cpu_possible_mask))
        return -EINVAL;

    if (isadd == REGISTER) {
        for_each_cpu(cpu, mask) {
            s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
                     cpu_to_node(cpu));
            if (!s)
                goto cleanup;
            s->pid = pid;
            INIT_LIST_HEAD(&s->list);
            s->valid = 1;

            listeners = &per_cpu(listener_array, cpu);
            down_write(&listeners->sem);
            list_add(&s->list, &listeners->list);
            up_write(&listeners->sem);
        }
        return 0;
    }

    /* Deregister or cleanup */
cleanup:
    for_each_cpu(cpu, mask) {
        listeners = &per_cpu(listener_array, cpu);
        down_write(&listeners->sem);
        list_for_each_entry_safe(s, tmp, &listeners->list, list) {
            if (s->pid == pid) {
                list_del(&s->list);
                kfree(s);
                break;
            }
        }
        up_write(&listeners->sem);
    }
    return 0;
}

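/*
 * Parse a cpulist string carried in attribute @na into @mask.
 * Returns 1 when the attribute is absent, 0 on success, negative on error.
 */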
static int parse(struct nlattr *na, struct cpumask *mask)
{
    char *data;
    int len;
    int ret;

    if (na == NULL)
        return 1;
    len = nla_len(na);
    if (len > TASKSTATS_CPUMASK_MAXLEN)
        return -E2BIG;
    if (len < 1)
        return -EINVAL;
    data = kmalloc(len, GFP_KERNEL);
    if (!data)
        return -ENOMEM;
    nla_strlcpy(data, na, len);
    ret = cpulist_parse(data, mask);
    kfree(data);
    return ret;
}

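/*
 * Start a nested PID or TGID aggregate in @skb and reserve space for the
 * taskstats payload; returns a pointer to the reserved stats area.
 */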
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
    struct nlattr *na, *ret;
    int aggr;

    aggr = (type == TASKSTATS_TYPE_PID)
            ? TASKSTATS_TYPE_AGGR_PID
            : TASKSTATS_TYPE_AGGR_TGID;

    na = nla_nest_start(skb, aggr);
    if (!na)
        goto err;
    if (nla_put(skb, type, sizeof(pid), &pid) < 0)
        goto err;
    ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
    if (!ret)
        goto err;
    nla_nest_end(skb, na);

    return nla_data(ret);
err:
    return NULL;
}

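/*
 * Handle CGROUPSTATS_CMD_GET: build and send cgroupstats for the cgroup
 * referred to by the file descriptor in CGROUPSTATS_CMD_ATTR_FD.
 */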
static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
    int rc = 0;
    struct sk_buff *rep_skb;
    struct cgroupstats *stats;
    struct nlattr *na;
    size_t size;
    u32 fd;
    struct file *file;
    int fput_needed;

    na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
    if (!na)
        return -EINVAL;

    fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
    file = fget_light(fd, &fput_needed);
    if (!file)
        return 0;

    size = nla_total_size(sizeof(struct cgroupstats));

    rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
                size);
    if (rc < 0)
        goto err;

    na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
                sizeof(struct cgroupstats));
    stats = nla_data(na);
    memset(stats, 0, sizeof(*stats));

    rc = cgroupstats_build(stats, file->f_dentry);
    if (rc < 0) {
        nlmsg_free(rep_skb);
        goto err;
    }

    rc = send_reply(rep_skb, info);

err:
    fput_light(file, fput_needed);
    return rc;
}

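/*
 * Handle TASKSTATS_CMD_GET: register/deregister a cpumask listener or
 * return a one-shot taskstats sample for the requested pid or tgid.
 */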
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
    int rc;
    struct sk_buff *rep_skb;
    struct taskstats *stats;
    size_t size;
    cpumask_var_t mask;

    if (!alloc_cpumask_var(&mask, GFP_KERNEL))
        return -ENOMEM;

    rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
    if (rc < 0)
        goto free_return_rc;
    if (rc == 0) {
        rc = add_del_listener(info->snd_pid, mask, REGISTER);
        goto free_return_rc;
    }

    rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
    if (rc < 0)
        goto free_return_rc;
    if (rc == 0) {
        rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
free_return_rc:
        free_cpumask_var(mask);
        return rc;
    }
    free_cpumask_var(mask);

    /*
     * Size includes space for nested attributes
     */
    size = nla_total_size(sizeof(u32)) +
        nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

    rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
    if (rc < 0)
        return rc;

    rc = -EINVAL;
    if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
        u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
        stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
        if (!stats)
            goto err;

        rc = fill_pid(pid, NULL, stats);
        if (rc < 0)
            goto err;
    } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
        u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
        stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
        if (!stats)
            goto err;

        rc = fill_tgid(tgid, NULL, stats);
        if (rc < 0)
            goto err;
    } else
        goto err;

    return send_reply(rep_skb, info);
err:
    nlmsg_free(rep_skb);
    return rc;
}

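/*
 * Lazily allocate the per-thread-group taskstats structure; siglock
 * resolves the race if several threads attempt the allocation at once.
 */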
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
    struct signal_struct *sig = tsk->signal;
    struct taskstats *stats;

    if (sig->stats || thread_group_empty(tsk))
        goto ret;

    /* No problem if kmem_cache_zalloc() fails */
    stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);

    spin_lock_irq(&tsk->sighand->siglock);
    if (!sig->stats) {
        sig->stats = stats;
        stats = NULL;
    }
    spin_unlock_irq(&tsk->sighand->siglock);

    if (stats)
        kmem_cache_free(taskstats_cache, stats);
ret:
    return sig->stats;
}

/* Send pid data out on exit */
void taskstats_exit(struct task_struct *tsk, int group_dead)
{
    int rc;
    struct listener_list *listeners;
    struct taskstats *stats;
    struct sk_buff *rep_skb;
    size_t size;
    int is_thread_group;

    if (!family_registered)
        return;

    /*
     * Size includes space for nested attributes
     */
    size = nla_total_size(sizeof(u32)) +
        nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

    is_thread_group = !!taskstats_tgid_alloc(tsk);
    if (is_thread_group) {
        /* PID + STATS + TGID + STATS */
        size = 2 * size;
        /* fill the tsk->signal->stats structure */
        fill_tgid_exit(tsk);
    }

    listeners = &__raw_get_cpu_var(listener_array);
    if (list_empty(&listeners->list))
        return;

    rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
    if (rc < 0)
        return;

    stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
    if (!stats)
        goto err;

    rc = fill_pid(-1, tsk, stats);
    if (rc < 0)
        goto err;

    /*
     * Doesn't matter if tsk is the leader or the last group member leaving
     */
    if (!is_thread_group || !group_dead)
        goto send;

    stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
    if (!stats)
        goto err;

    memcpy(stats, tsk->signal->stats, sizeof(*stats));

send:
    send_cpu_listeners(rep_skb, listeners);
    return;
err:
    nlmsg_free(rep_skb);
}

static struct genl_ops taskstats_ops = {
    .cmd = TASKSTATS_CMD_GET,
    .doit = taskstats_user_cmd,
    .policy = taskstats_cmd_get_policy,
};

static struct genl_ops cgroupstats_ops = {
    .cmd = CGROUPSTATS_CMD_GET,
    .doit = cgroupstats_user_cmd,
    .policy = cgroupstats_cmd_get_policy,
};

/* Needed early in initialization */
void __init taskstats_init_early(void)
{
    unsigned int i;

    taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
    for_each_possible_cpu(i) {
        INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
        init_rwsem(&(per_cpu(listener_array, i).sem));
    }
}

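/* Register the taskstats genetlink family and its GET command handlers */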
static int __init taskstats_init(void)
{
    int rc;

    rc = genl_register_family(&family);
    if (rc)
        return rc;

    rc = genl_register_ops(&family, &taskstats_ops);
    if (rc < 0)
        goto err;

    rc = genl_register_ops(&family, &cgroupstats_ops);
    if (rc < 0)
        goto err_cgroup_ops;

    family_registered = 1;
    printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
    return 0;
err_cgroup_ops:
    genl_unregister_ops(&family, &taskstats_ops);
err:
    genl_unregister_family(&family);
    return rc;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface
 */
late_initcall(taskstats_init);

