Root/kernel/perf_event.c

1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
24#include <linux/hardirq.h>
25#include <linux/rculist.h>
26#include <linux/uaccess.h>
27#include <linux/syscalls.h>
28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h>
31
32#include <asm/irq_regs.h>
33
34/*
35 * Each CPU has a list of per CPU events:
36 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38
39int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly;
41static int perf_overcommit __read_mostly = 1;
42
43static atomic_t nr_events __read_mostly;
44static atomic_t nr_mmap_events __read_mostly;
45static atomic_t nr_comm_events __read_mostly;
46static atomic_t nr_task_events __read_mostly;
47
48/*
49 * perf event paranoia level:
50 * -1 - not paranoid at all
51 * 0 - disallow raw tracepoint access for unpriv
52 * 1 - disallow cpu events for unpriv
53 * 2 - disallow kernel profiling for unpriv
54 */
55int sysctl_perf_event_paranoid __read_mostly = 1;
56
57static inline bool perf_paranoid_tracepoint_raw(void)
58{
59    return sysctl_perf_event_paranoid > -1;
60}
61
62static inline bool perf_paranoid_cpu(void)
63{
64    return sysctl_perf_event_paranoid > 0;
65}
66
67static inline bool perf_paranoid_kernel(void)
68{
69    return sysctl_perf_event_paranoid > 1;
70}
71
72int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
73
74/*
75 * max perf event sample rate
76 */
77int sysctl_perf_event_sample_rate __read_mostly = 100000;
78
79static atomic64_t perf_event_id;
80
81/*
82 * Lock for (sysadmin-configurable) event reservations:
83 */
84static DEFINE_SPINLOCK(perf_resource_lock);
85
86/*
87 * Architecture provided APIs - weak aliases:
88 */
89extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
90{
91    return NULL;
92}
93
94void __weak hw_perf_disable(void) { barrier(); }
95void __weak hw_perf_enable(void) { barrier(); }
96
97void __weak hw_perf_event_setup(int cpu) { barrier(); }
98void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
99
100int __weak
101hw_perf_group_sched_in(struct perf_event *group_leader,
102           struct perf_cpu_context *cpuctx,
103           struct perf_event_context *ctx, int cpu)
104{
105    return 0;
106}
107
108void __weak perf_event_print_debug(void) { }
109
110static DEFINE_PER_CPU(int, perf_disable_count);
111
112void __perf_disable(void)
113{
114    __get_cpu_var(perf_disable_count)++;
115}
116
117bool __perf_enable(void)
118{
119    return !--__get_cpu_var(perf_disable_count);
120}
121
/*
 * Disable events on this CPU: record one more level of nesting, then call
 * the hardware disable hook (called on every invocation, nested or not).
 */
void perf_disable(void)
{
    __perf_disable();
    hw_perf_disable();
}
127
/*
 * Undo one perf_disable().  The hardware enable hook is only called once
 * the nesting depth has dropped back to zero.
 */
void perf_enable(void)
{
    if (!__perf_enable())
        return;

    hw_perf_enable();
}
133
134static void get_ctx(struct perf_event_context *ctx)
135{
136    WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
137}
138
139static void free_ctx(struct rcu_head *head)
140{
141    struct perf_event_context *ctx;
142
143    ctx = container_of(head, struct perf_event_context, rcu_head);
144    kfree(ctx);
145}
146
147static void put_ctx(struct perf_event_context *ctx)
148{
149    if (atomic_dec_and_test(&ctx->refcount)) {
150        if (ctx->parent_ctx)
151            put_ctx(ctx->parent_ctx);
152        if (ctx->task)
153            put_task_struct(ctx->task);
154        call_rcu(&ctx->rcu_head, free_ctx);
155    }
156}
157
158static void unclone_ctx(struct perf_event_context *ctx)
159{
160    if (ctx->parent_ctx) {
161        put_ctx(ctx->parent_ctx);
162        ctx->parent_ctx = NULL;
163    }
164}
165
166/*
167 * If we inherit events we want to return the parent event id
168 * to userspace.
169 */
170static u64 primary_event_id(struct perf_event *event)
171{
172    u64 id = event->id;
173
174    if (event->parent)
175        id = event->parent->id;
176
177    return id;
178}
179
180/*
181 * Get the perf_event_context for a task and lock it.
182 * This has to cope with with the fact that until it is locked,
183 * the context could get moved to another task.
184 */
185static struct perf_event_context *
186perf_lock_task_context(struct task_struct *task, unsigned long *flags)
187{
188    struct perf_event_context *ctx;
189
190    rcu_read_lock();
191 retry:
192    ctx = rcu_dereference(task->perf_event_ctxp);
193    if (ctx) {
194        /*
195         * If this context is a clone of another, it might
196         * get swapped for another underneath us by
197         * perf_event_task_sched_out, though the
198         * rcu_read_lock() protects us from any context
199         * getting freed. Lock the context and check if it
200         * got swapped before we could get the lock, and retry
201         * if so. If we locked the right context, then it
202         * can't get swapped on us any more.
203         */
204        spin_lock_irqsave(&ctx->lock, *flags);
205        if (ctx != rcu_dereference(task->perf_event_ctxp)) {
206            spin_unlock_irqrestore(&ctx->lock, *flags);
207            goto retry;
208        }
209
210        if (!atomic_inc_not_zero(&ctx->refcount)) {
211            spin_unlock_irqrestore(&ctx->lock, *flags);
212            ctx = NULL;
213        }
214    }
215    rcu_read_unlock();
216    return ctx;
217}
218
219/*
220 * Get the context for a task and increment its pin_count so it
221 * can't get swapped to another task. This also increments its
222 * reference count so that the context can't get freed.
223 */
224static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
225{
226    struct perf_event_context *ctx;
227    unsigned long flags;
228
229    ctx = perf_lock_task_context(task, &flags);
230    if (ctx) {
231        ++ctx->pin_count;
232        spin_unlock_irqrestore(&ctx->lock, flags);
233    }
234    return ctx;
235}
236
237static void perf_unpin_context(struct perf_event_context *ctx)
238{
239    unsigned long flags;
240
241    spin_lock_irqsave(&ctx->lock, flags);
242    --ctx->pin_count;
243    spin_unlock_irqrestore(&ctx->lock, flags);
244    put_ctx(ctx);
245}
246
247/*
248 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held.
250 */
251static void
252list_add_event(struct perf_event *event, struct perf_event_context *ctx)
253{
254    struct perf_event *group_leader = event->group_leader;
255
256    /*
257     * Depending on whether it is a standalone or sibling event,
258     * add it straight to the context's event list, or to the group
259     * leader's sibling list:
260     */
261    if (group_leader == event)
262        list_add_tail(&event->group_entry, &ctx->group_list);
263    else {
264        list_add_tail(&event->group_entry, &group_leader->sibling_list);
265        group_leader->nr_siblings++;
266    }
267
268    list_add_rcu(&event->event_entry, &ctx->event_list);
269    ctx->nr_events++;
270    if (event->attr.inherit_stat)
271        ctx->nr_stat++;
272}
273
274/*
275 * Remove a event from the lists for its context.
276 * Must be called with ctx->mutex and ctx->lock held.
277 */
278static void
279list_del_event(struct perf_event *event, struct perf_event_context *ctx)
280{
281    struct perf_event *sibling, *tmp;
282
283    if (list_empty(&event->group_entry))
284        return;
285    ctx->nr_events--;
286    if (event->attr.inherit_stat)
287        ctx->nr_stat--;
288
289    list_del_init(&event->group_entry);
290    list_del_rcu(&event->event_entry);
291
292    if (event->group_leader != event)
293        event->group_leader->nr_siblings--;
294
295    /*
296     * If this was a group event with sibling events then
297     * upgrade the siblings to singleton events by adding them
298     * to the context list directly:
299     */
300    list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
301
302        list_move_tail(&sibling->group_entry, &ctx->group_list);
303        sibling->group_leader = sibling;
304    }
305}
306
307static void
308event_sched_out(struct perf_event *event,
309          struct perf_cpu_context *cpuctx,
310          struct perf_event_context *ctx)
311{
312    if (event->state != PERF_EVENT_STATE_ACTIVE)
313        return;
314
315    event->state = PERF_EVENT_STATE_INACTIVE;
316    if (event->pending_disable) {
317        event->pending_disable = 0;
318        event->state = PERF_EVENT_STATE_OFF;
319    }
320    event->tstamp_stopped = ctx->time;
321    event->pmu->disable(event);
322    event->oncpu = -1;
323
324    if (!is_software_event(event))
325        cpuctx->active_oncpu--;
326    ctx->nr_active--;
327    if (event->attr.exclusive || !cpuctx->active_oncpu)
328        cpuctx->exclusive = 0;
329}
330
331static void
332group_sched_out(struct perf_event *group_event,
333        struct perf_cpu_context *cpuctx,
334        struct perf_event_context *ctx)
335{
336    struct perf_event *event;
337
338    if (group_event->state != PERF_EVENT_STATE_ACTIVE)
339        return;
340
341    event_sched_out(group_event, cpuctx, ctx);
342
343    /*
344     * Schedule out siblings (if any):
345     */
346    list_for_each_entry(event, &group_event->sibling_list, group_entry)
347        event_sched_out(event, cpuctx, ctx);
348
349    if (group_event->attr.exclusive)
350        cpuctx->exclusive = 0;
351}
352
353/*
354 * Cross CPU call to remove a performance event
355 *
356 * We disable the event on the hardware level first. After that we
357 * remove it from the context list.
358 */
359static void __perf_event_remove_from_context(void *info)
360{
361    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
362    struct perf_event *event = info;
363    struct perf_event_context *ctx = event->ctx;
364
365    /*
366     * If this is a task context, we need to check whether it is
367     * the current task context of this cpu. If not it has been
368     * scheduled out before the smp call arrived.
369     */
370    if (ctx->task && cpuctx->task_ctx != ctx)
371        return;
372
373    spin_lock(&ctx->lock);
374    /*
375     * Protect the list operation against NMI by disabling the
376     * events on a global level.
377     */
378    perf_disable();
379
380    event_sched_out(event, cpuctx, ctx);
381
382    list_del_event(event, ctx);
383
384    if (!ctx->task) {
385        /*
386         * Allow more per task events with respect to the
387         * reservation:
388         */
389        cpuctx->max_pertask =
390            min(perf_max_events - ctx->nr_events,
391                perf_max_events - perf_reserved_percpu);
392    }
393
394    perf_enable();
395    spin_unlock(&ctx->lock);
396}
397
398
399/*
400 * Remove the event from a task's (or a CPU's) list of events.
401 *
402 * Must be called with ctx->mutex held.
403 *
404 * CPU events are removed with a smp call. For task events we only
405 * call when the task is on a CPU.
406 *
407 * If event->ctx is a cloned context, callers must make sure that
408 * every task struct that event->ctx->task could possibly point to
409 * remains valid. This is OK when called from perf_release since
410 * that only calls us on the top-level context, which can't be a clone.
411 * When called from perf_event_exit_task, it's OK because the
412 * context has been detached from its task.
413 */
414static void perf_event_remove_from_context(struct perf_event *event)
415{
416    struct perf_event_context *ctx = event->ctx;
417    struct task_struct *task = ctx->task;
418
419    if (!task) {
420        /*
421         * Per cpu events are removed via an smp call and
422         * the removal is always sucessful.
423         */
424        smp_call_function_single(event->cpu,
425                     __perf_event_remove_from_context,
426                     event, 1);
427        return;
428    }
429
430retry:
431    task_oncpu_function_call(task, __perf_event_remove_from_context,
432                 event);
433
434    spin_lock_irq(&ctx->lock);
435    /*
436     * If the context is active we need to retry the smp call.
437     */
438    if (ctx->nr_active && !list_empty(&event->group_entry)) {
439        spin_unlock_irq(&ctx->lock);
440        goto retry;
441    }
442
443    /*
444     * The lock prevents that this context is scheduled in so we
445     * can remove the event safely, if the call above did not
446     * succeed.
447     */
448    if (!list_empty(&event->group_entry)) {
449        list_del_event(event, ctx);
450    }
451    spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456    return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464    u64 now = perf_clock();
465
466    ctx->time += now - ctx->timestamp;
467    ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475    struct perf_event_context *ctx = event->ctx;
476    u64 run_end;
477
478    if (event->state < PERF_EVENT_STATE_INACTIVE ||
479        event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480        return;
481
482    event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484    if (event->state == PERF_EVENT_STATE_INACTIVE)
485        run_end = event->tstamp_stopped;
486    else
487        run_end = ctx->time;
488
489    event->total_time_running = run_end - event->tstamp_running;
490}
491
492/*
493 * Update total_time_enabled and total_time_running for all events in a group.
494 */
495static void update_group_times(struct perf_event *leader)
496{
497    struct perf_event *event;
498
499    update_event_times(leader);
500    list_for_each_entry(event, &leader->sibling_list, group_entry)
501        update_event_times(event);
502}
503
504/*
505 * Cross CPU call to disable a performance event
506 */
507static void __perf_event_disable(void *info)
508{
509    struct perf_event *event = info;
510    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
511    struct perf_event_context *ctx = event->ctx;
512
513    /*
514     * If this is a per-task event, need to check whether this
515     * event's task is the current task on this cpu.
516     */
517    if (ctx->task && cpuctx->task_ctx != ctx)
518        return;
519
520    spin_lock(&ctx->lock);
521
522    /*
523     * If the event is on, turn it off.
524     * If it is in error state, leave it in error state.
525     */
526    if (event->state >= PERF_EVENT_STATE_INACTIVE) {
527        update_context_time(ctx);
528        update_group_times(event);
529        if (event == event->group_leader)
530            group_sched_out(event, cpuctx, ctx);
531        else
532            event_sched_out(event, cpuctx, ctx);
533        event->state = PERF_EVENT_STATE_OFF;
534    }
535
536    spin_unlock(&ctx->lock);
537}
538
539/*
540 * Disable a event.
541 *
542 * If event->ctx is a cloned context, callers must make sure that
543 * every task struct that event->ctx->task could possibly point to
544 * remains valid. This condition is satisifed when called through
545 * perf_event_for_each_child or perf_event_for_each because they
546 * hold the top-level event's child_mutex, so any descendant that
547 * goes to exit will block in sync_child_event.
548 * When called from perf_pending_event it's OK because event->ctx
549 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context.
551 */
552static void perf_event_disable(struct perf_event *event)
553{
554    struct perf_event_context *ctx = event->ctx;
555    struct task_struct *task = ctx->task;
556
557    if (!task) {
558        /*
559         * Disable the event on the cpu that it's on
560         */
561        smp_call_function_single(event->cpu, __perf_event_disable,
562                     event, 1);
563        return;
564    }
565
566 retry:
567    task_oncpu_function_call(task, __perf_event_disable, event);
568
569    spin_lock_irq(&ctx->lock);
570    /*
571     * If the event is still active, we need to retry the cross-call.
572     */
573    if (event->state == PERF_EVENT_STATE_ACTIVE) {
574        spin_unlock_irq(&ctx->lock);
575        goto retry;
576    }
577
578    /*
579     * Since we have the lock this context can't be scheduled
580     * in, so we can change the state safely.
581     */
582    if (event->state == PERF_EVENT_STATE_INACTIVE) {
583        update_group_times(event);
584        event->state = PERF_EVENT_STATE_OFF;
585    }
586
587    spin_unlock_irq(&ctx->lock);
588}
589
590static int
591event_sched_in(struct perf_event *event,
592         struct perf_cpu_context *cpuctx,
593         struct perf_event_context *ctx,
594         int cpu)
595{
596    if (event->state <= PERF_EVENT_STATE_OFF)
597        return 0;
598
599    event->state = PERF_EVENT_STATE_ACTIVE;
600    event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
601    /*
602     * The new state must be visible before we turn it on in the hardware:
603     */
604    smp_wmb();
605
606    if (event->pmu->enable(event)) {
607        event->state = PERF_EVENT_STATE_INACTIVE;
608        event->oncpu = -1;
609        return -EAGAIN;
610    }
611
612    event->tstamp_running += ctx->time - event->tstamp_stopped;
613
614    if (!is_software_event(event))
615        cpuctx->active_oncpu++;
616    ctx->nr_active++;
617
618    if (event->attr.exclusive)
619        cpuctx->exclusive = 1;
620
621    return 0;
622}
623
624static int
625group_sched_in(struct perf_event *group_event,
626           struct perf_cpu_context *cpuctx,
627           struct perf_event_context *ctx,
628           int cpu)
629{
630    struct perf_event *event, *partial_group;
631    int ret;
632
633    if (group_event->state == PERF_EVENT_STATE_OFF)
634        return 0;
635
636    ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
637    if (ret)
638        return ret < 0 ? ret : 0;
639
640    if (event_sched_in(group_event, cpuctx, ctx, cpu))
641        return -EAGAIN;
642
643    /*
644     * Schedule in siblings as one group (if any):
645     */
646    list_for_each_entry(event, &group_event->sibling_list, group_entry) {
647        if (event_sched_in(event, cpuctx, ctx, cpu)) {
648            partial_group = event;
649            goto group_error;
650        }
651    }
652
653    return 0;
654
655group_error:
656    /*
657     * Groups can be scheduled in as one unit only, so undo any
658     * partial group before returning:
659     */
660    list_for_each_entry(event, &group_event->sibling_list, group_entry) {
661        if (event == partial_group)
662            break;
663        event_sched_out(event, cpuctx, ctx);
664    }
665    event_sched_out(group_event, cpuctx, ctx);
666
667    return -EAGAIN;
668}
669
670/*
671 * Return 1 for a group consisting entirely of software events,
672 * 0 if the group contains any hardware events.
673 */
674static int is_software_only_group(struct perf_event *leader)
675{
676    struct perf_event *event;
677
678    if (!is_software_event(leader))
679        return 0;
680
681    list_for_each_entry(event, &leader->sibling_list, group_entry)
682        if (!is_software_event(event))
683            return 0;
684
685    return 1;
686}
687
688/*
689 * Work out whether we can put this event group on the CPU now.
690 */
691static int group_can_go_on(struct perf_event *event,
692               struct perf_cpu_context *cpuctx,
693               int can_add_hw)
694{
695    /*
696     * Groups consisting entirely of software events can always go on.
697     */
698    if (is_software_only_group(event))
699        return 1;
700    /*
701     * If an exclusive group is already on, no other hardware
702     * events can go on.
703     */
704    if (cpuctx->exclusive)
705        return 0;
706    /*
707     * If this group is exclusive and there are already
708     * events on the CPU, it can't go on.
709     */
710    if (event->attr.exclusive && cpuctx->active_oncpu)
711        return 0;
712    /*
713     * Otherwise, try to add it if all previous groups were able
714     * to go on.
715     */
716    return can_add_hw;
717}
718
719static void add_event_to_ctx(struct perf_event *event,
720                   struct perf_event_context *ctx)
721{
722    list_add_event(event, ctx);
723    event->tstamp_enabled = ctx->time;
724    event->tstamp_running = ctx->time;
725    event->tstamp_stopped = ctx->time;
726}
727
728/*
729 * Cross CPU call to install and enable a performance event
730 *
731 * Must be called with ctx->mutex held
732 */
733static void __perf_install_in_context(void *info)
734{
735    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
736    struct perf_event *event = info;
737    struct perf_event_context *ctx = event->ctx;
738    struct perf_event *leader = event->group_leader;
739    int cpu = smp_processor_id();
740    int err;
741
742    /*
743     * If this is a task context, we need to check whether it is
744     * the current task context of this cpu. If not it has been
745     * scheduled out before the smp call arrived.
746     * Or possibly this is the right context but it isn't
747     * on this cpu because it had no events.
748     */
749    if (ctx->task && cpuctx->task_ctx != ctx) {
750        if (cpuctx->task_ctx || ctx->task != current)
751            return;
752        cpuctx->task_ctx = ctx;
753    }
754
755    spin_lock(&ctx->lock);
756    ctx->is_active = 1;
757    update_context_time(ctx);
758
759    /*
760     * Protect the list operation against NMI by disabling the
761     * events on a global level. NOP for non NMI based events.
762     */
763    perf_disable();
764
765    add_event_to_ctx(event, ctx);
766
767    /*
768     * Don't put the event on if it is disabled or if
769     * it is in a group and the group isn't on.
770     */
771    if (event->state != PERF_EVENT_STATE_INACTIVE ||
772        (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
773        goto unlock;
774
775    /*
776     * An exclusive event can't go on if there are already active
777     * hardware events, and no hardware event can go on if there
778     * is already an exclusive event on.
779     */
780    if (!group_can_go_on(event, cpuctx, 1))
781        err = -EEXIST;
782    else
783        err = event_sched_in(event, cpuctx, ctx, cpu);
784
785    if (err) {
786        /*
787         * This event couldn't go on. If it is in a group
788         * then we have to pull the whole group off.
789         * If the event group is pinned then put it in error state.
790         */
791        if (leader != event)
792            group_sched_out(leader, cpuctx, ctx);
793        if (leader->attr.pinned) {
794            update_group_times(leader);
795            leader->state = PERF_EVENT_STATE_ERROR;
796        }
797    }
798
799    if (!err && !ctx->task && cpuctx->max_pertask)
800        cpuctx->max_pertask--;
801
802 unlock:
803    perf_enable();
804
805    spin_unlock(&ctx->lock);
806}
807
808/*
809 * Attach a performance event to a context
810 *
811 * First we add the event to the list with the hardware enable bit
812 * in event->hw_config cleared.
813 *
814 * If the event is attached to a task which is on a CPU we use a smp
815 * call to enable it in the task context. The task might have been
816 * scheduled away, but we check this in the smp call again.
817 *
818 * Must be called with ctx->mutex held.
819 */
820static void
821perf_install_in_context(struct perf_event_context *ctx,
822            struct perf_event *event,
823            int cpu)
824{
825    struct task_struct *task = ctx->task;
826
827    if (!task) {
828        /*
829         * Per cpu events are installed via an smp call and
830         * the install is always sucessful.
831         */
832        smp_call_function_single(cpu, __perf_install_in_context,
833                     event, 1);
834        return;
835    }
836
837retry:
838    task_oncpu_function_call(task, __perf_install_in_context,
839                 event);
840
841    spin_lock_irq(&ctx->lock);
842    /*
843     * we need to retry the smp call.
844     */
845    if (ctx->is_active && list_empty(&event->group_entry)) {
846        spin_unlock_irq(&ctx->lock);
847        goto retry;
848    }
849
850    /*
851     * The lock prevents that this context is scheduled in so we
852     * can add the event safely, if it the call above did not
853     * succeed.
854     */
855    if (list_empty(&event->group_entry))
856        add_event_to_ctx(event, ctx);
857    spin_unlock_irq(&ctx->lock);
858}
859
860/*
861 * Put a event into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_event_mark_enabled(struct perf_event *event,
869                    struct perf_event_context *ctx)
870{
871    struct perf_event *sub;
872
873    event->state = PERF_EVENT_STATE_INACTIVE;
874    event->tstamp_enabled = ctx->time - event->total_time_enabled;
875    list_for_each_entry(sub, &event->sibling_list, group_entry)
876        if (sub->state >= PERF_EVENT_STATE_INACTIVE)
877            sub->tstamp_enabled =
878                ctx->time - sub->total_time_enabled;
879}
880
881/*
882 * Cross CPU call to enable a performance event
883 */
884static void __perf_event_enable(void *info)
885{
886    struct perf_event *event = info;
887    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
888    struct perf_event_context *ctx = event->ctx;
889    struct perf_event *leader = event->group_leader;
890    int err;
891
892    /*
893     * If this is a per-task event, need to check whether this
894     * event's task is the current task on this cpu.
895     */
896    if (ctx->task && cpuctx->task_ctx != ctx) {
897        if (cpuctx->task_ctx || ctx->task != current)
898            return;
899        cpuctx->task_ctx = ctx;
900    }
901
902    spin_lock(&ctx->lock);
903    ctx->is_active = 1;
904    update_context_time(ctx);
905
906    if (event->state >= PERF_EVENT_STATE_INACTIVE)
907        goto unlock;
908    __perf_event_mark_enabled(event, ctx);
909
910    /*
911     * If the event is in a group and isn't the group leader,
912     * then don't put it on unless the group is on.
913     */
914    if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
915        goto unlock;
916
917    if (!group_can_go_on(event, cpuctx, 1)) {
918        err = -EEXIST;
919    } else {
920        perf_disable();
921        if (event == leader)
922            err = group_sched_in(event, cpuctx, ctx,
923                         smp_processor_id());
924        else
925            err = event_sched_in(event, cpuctx, ctx,
926                           smp_processor_id());
927        perf_enable();
928    }
929
930    if (err) {
931        /*
932         * If this event can't go on and it's part of a
933         * group, then the whole group has to come off.
934         */
935        if (leader != event)
936            group_sched_out(leader, cpuctx, ctx);
937        if (leader->attr.pinned) {
938            update_group_times(leader);
939            leader->state = PERF_EVENT_STATE_ERROR;
940        }
941    }
942
943 unlock:
944    spin_unlock(&ctx->lock);
945}
946
947/*
948 * Enable a event.
949 *
950 * If event->ctx is a cloned context, callers must make sure that
951 * every task struct that event->ctx->task could possibly point to
952 * remains valid. This condition is satisfied when called through
953 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable.
955 */
956static void perf_event_enable(struct perf_event *event)
957{
958    struct perf_event_context *ctx = event->ctx;
959    struct task_struct *task = ctx->task;
960
961    if (!task) {
962        /*
963         * Enable the event on the cpu that it's on
964         */
965        smp_call_function_single(event->cpu, __perf_event_enable,
966                     event, 1);
967        return;
968    }
969
970    spin_lock_irq(&ctx->lock);
971    if (event->state >= PERF_EVENT_STATE_INACTIVE)
972        goto out;
973
974    /*
975     * If the event is in error state, clear that first.
976     * That way, if we see the event in error state below, we
977     * know that it has gone back into error state, as distinct
978     * from the task having been scheduled away before the
979     * cross-call arrived.
980     */
981    if (event->state == PERF_EVENT_STATE_ERROR)
982        event->state = PERF_EVENT_STATE_OFF;
983
984 retry:
985    spin_unlock_irq(&ctx->lock);
986    task_oncpu_function_call(task, __perf_event_enable, event);
987
988    spin_lock_irq(&ctx->lock);
989
990    /*
991     * If the context is active and the event is still off,
992     * we need to retry the cross-call.
993     */
994    if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
995        goto retry;
996
997    /*
998     * Since we have the lock this context can't be scheduled
999     * in, so we can change the state safely.
1000     */
1001    if (event->state == PERF_EVENT_STATE_OFF)
1002        __perf_event_mark_enabled(event, ctx);
1003
1004 out:
1005    spin_unlock_irq(&ctx->lock);
1006}
1007
1008static int perf_event_refresh(struct perf_event *event, int refresh)
1009{
1010    /*
1011     * not supported on inherited events
1012     */
1013    if (event->attr.inherit)
1014        return -EINVAL;
1015
1016    atomic_add(refresh, &event->event_limit);
1017    perf_event_enable(event);
1018
1019    return 0;
1020}
1021
1022void __perf_event_sched_out(struct perf_event_context *ctx,
1023                  struct perf_cpu_context *cpuctx)
1024{
1025    struct perf_event *event;
1026
1027    spin_lock(&ctx->lock);
1028    ctx->is_active = 0;
1029    if (likely(!ctx->nr_events))
1030        goto out;
1031    update_context_time(ctx);
1032
1033    perf_disable();
1034    if (ctx->nr_active)
1035        list_for_each_entry(event, &ctx->group_list, group_entry)
1036            group_sched_out(event, cpuctx, ctx);
1037
1038    perf_enable();
1039 out:
1040    spin_unlock(&ctx->lock);
1041}
1042
1043/*
1044 * Test whether two contexts are equivalent, i.e. whether they
1045 * have both been cloned from the same version of the same context
1046 * and they both have the same number of enabled events.
1047 * If the number of enabled events is the same, then the set
1048 * of enabled events should be the same, because these are both
1049 * inherited contexts, therefore we can't access individual events
1050 * in them directly with an fd; we can only enable/disable all
1051 * events via prctl, or enable/disable all events in a family
1052 * via ioctl, which will have the same effect on both contexts.
1053 */
1054static int context_equiv(struct perf_event_context *ctx1,
1055             struct perf_event_context *ctx2)
1056{
1057    return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1058        && ctx1->parent_gen == ctx2->parent_gen
1059        && !ctx1->pin_count && !ctx2->pin_count;
1060}
1061
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event,
1065                     struct perf_event *next_event)
1066{
1067    u64 value;
1068
1069    if (!event->attr.inherit_stat)
1070        return;
1071
1072    /*
1073     * Update the event value, we cannot use perf_event_read()
1074     * because we're in the middle of a context switch and have IRQs
1075     * disabled, which upsets smp_call_function_single(), however
1076     * we know the event must be on the current CPU, therefore we
1077     * don't need to use it.
1078     */
1079    switch (event->state) {
1080    case PERF_EVENT_STATE_ACTIVE:
1081        __perf_event_read(event);
1082        break;
1083
1084    case PERF_EVENT_STATE_INACTIVE:
1085        update_event_times(event);
1086        break;
1087
1088    default:
1089        break;
1090    }
1091
1092    /*
1093     * In order to keep per-task stats reliable we need to flip the event
1094     * values when we flip the contexts.
1095     */
1096    value = atomic64_read(&next_event->count);
1097    value = atomic64_xchg(&event->count, value);
1098    atomic64_set(&next_event->count, value);
1099
1100    swap(event->total_time_enabled, next_event->total_time_enabled);
1101    swap(event->total_time_running, next_event->total_time_running);
1102
1103    /*
1104     * Since we swizzled the values, update the user visible data too.
1105     */
1106    perf_event_update_userpage(event);
1107    perf_event_update_userpage(next_event);
1108}
1109
/*
 * list_next_entry - get the element following @pos in its list
 * @pos:	pointer to the current element
 * @member:	name of the struct list_head field inside the element
 *
 * @pos is parenthesized so that arbitrary pointer expressions can be
 * passed safely.  No end-of-list check is made; the caller must compare
 * the result's @member against the list head.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1112
1113static void perf_event_sync_stat(struct perf_event_context *ctx,
1114                   struct perf_event_context *next_ctx)
1115{
1116    struct perf_event *event, *next_event;
1117
1118    if (!ctx->nr_stat)
1119        return;
1120
1121    event = list_first_entry(&ctx->event_list,
1122                   struct perf_event, event_entry);
1123
1124    next_event = list_first_entry(&next_ctx->event_list,
1125                    struct perf_event, event_entry);
1126
1127    while (&event->event_entry != &ctx->event_list &&
1128           &next_event->event_entry != &next_ctx->event_list) {
1129
1130        __perf_event_sync_stat(event, next_event);
1131
1132        event = list_next_entry(event, event_entry);
1133        next_event = list_next_entry(next_event, event_entry);
1134    }
1135}
1136
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: the task being switched out
 * @next: the task being switched in
 * @cpu:  the CPU whose perf_cpu_context is used
 *
 * If both tasks carry equivalent cloned contexts (see context_equiv()),
 * the contexts are simply swapped between the tasks and nothing is
 * scheduled out; otherwise @task's context is scheduled out and the
 * CPU's task_ctx pointer is cleared.
 */
void perf_event_task_sched_out(struct task_struct *task,
				 struct task_struct *next, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
	struct pt_regs *regs;
	int do_switch = 1;

	/* Account the context switch itself as a software event. */
	regs = task_pt_regs(task);
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);

	/* Nothing to do if the task has no context or none is active here. */
	if (likely(!ctx || !cpuctx->task_ctx))
		return;

	update_context_time(ctx);

	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
	next_ctx = next->perf_event_ctxp;
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		spin_lock(&ctx->lock);
		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			/*
			 * XXX do we need a memory barrier of sorts
			 * wrt to rcu_dereference() of perf_event_ctxp
			 */
			task->perf_event_ctxp = next_ctx;
			next->perf_event_ctxp = ctx;
			ctx->task = next;
			next_ctx->task = task;
			/* Contexts were swapped: skip the real sched-out. */
			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		spin_unlock(&next_ctx->lock);
		spin_unlock(&ctx->lock);
	}
	rcu_read_unlock();

	if (do_switch) {
		__perf_event_sched_out(ctx, cpuctx);
		cpuctx->task_ctx = NULL;
	}
}
1205
1206/*
1207 * Called with IRQs disabled
1208 */
1209static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1210{
1211    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1212
1213    if (!cpuctx->task_ctx)
1214        return;
1215
1216    if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1217        return;
1218
1219    __perf_event_sched_out(ctx, cpuctx);
1220    cpuctx->task_ctx = NULL;
1221}
1222
1223/*
1224 * Called with IRQs disabled
1225 */
1226static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1227{
1228    __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1229}
1230
/*
 * Mark @ctx active and try to schedule its event groups onto @cpu.
 *
 * Pinned groups are handled in a first pass; a pinned group that is
 * still INACTIVE afterwards is moved to the ERROR state.  All other
 * groups are handled in a second pass, where a failed group_sched_in()
 * clears can_add_hw for the remaining groups.
 *
 * Runs under ctx->lock.
 */
static void
__perf_event_sched_in(struct perf_event_context *ctx,
			struct perf_cpu_context *cpuctx, int cpu)
{
	struct perf_event *event;
	int can_add_hw = 1;

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	if (likely(!ctx->nr_events))
		goto out;

	/* Restart the context's time base from now. */
	ctx->timestamp = perf_clock();

	perf_disable();

	/*
	 * First go through the list and put on any pinned groups
	 * in order to give them the best chance of going on.
	 */
	list_for_each_entry(event, &ctx->group_list, group_entry) {
		/* Skip groups that are OFF/ERROR and groups that aren't pinned. */
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    !event->attr.pinned)
			continue;
		/* Honour the event's 'cpu' filter (-1 means any CPU). */
		if (event->cpu != -1 && event->cpu != cpu)
			continue;

		if (group_can_go_on(event, cpuctx, 1))
			group_sched_in(event, cpuctx, ctx, cpu);

		/*
		 * If this pinned group hasn't been scheduled,
		 * put it in error state.
		 */
		if (event->state == PERF_EVENT_STATE_INACTIVE) {
			update_group_times(event);
			event->state = PERF_EVENT_STATE_ERROR;
		}
	}

	list_for_each_entry(event, &ctx->group_list, group_entry) {
		/*
		 * Ignore events in OFF or ERROR state, and
		 * ignore pinned events since we did them already.
		 */
		if (event->state <= PERF_EVENT_STATE_OFF ||
		    event->attr.pinned)
			continue;

		/*
		 * Listen to the 'cpu' scheduling filter constraint
		 * of events:
		 */
		if (event->cpu != -1 && event->cpu != cpu)
			continue;

		/* A failed group_sched_in() clears can_add_hw for later groups. */
		if (group_can_go_on(event, cpuctx, can_add_hw))
			if (group_sched_in(event, cpuctx, ctx, cpu))
				can_add_hw = 0;
	}
	perf_enable();
 out:
	spin_unlock(&ctx->lock);
}
1295
1296/*
1297 * Called from scheduler to add the events of the current task
1298 * with interrupts disabled.
1299 *
1300 * We restore the event value and then enable it.
1301 *
1302 * This does not protect us against NMI, but enable()
1303 * sets the enabled bit in the control field of event _before_
1304 * accessing the event control register. If a NMI hits, then it will
1305 * keep the event running.
1306 */
1307void perf_event_task_sched_in(struct task_struct *task, int cpu)
1308{
1309    struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1310    struct perf_event_context *ctx = task->perf_event_ctxp;
1311
1312    if (likely(!ctx))
1313        return;
1314    if (cpuctx->task_ctx == ctx)
1315        return;
1316    __perf_event_sched_in(ctx, cpuctx, cpu);
1317    cpuctx->task_ctx = ctx;
1318}
1319
1320static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1321{
1322    struct perf_event_context *ctx = &cpuctx->ctx;
1323
1324    __perf_event_sched_in(ctx, cpuctx, cpu);
1325}
1326
/*
 * Sentinel for hw_perf_event::interrupts: when perf_ctx_adjust_freq()
 * finds this value it logs an unthrottle record and calls the PMU's
 * unthrottle() hook.
 */
#define MAX_INTERRUPTS (~0ULL)
1328
1329static void perf_log_throttle(struct perf_event *event, int enable);
1330
1331static void perf_adjust_period(struct perf_event *event, u64 events)
1332{
1333    struct hw_perf_event *hwc = &event->hw;
1334    u64 period, sample_period;
1335    s64 delta;
1336
1337    events *= hwc->sample_period;
1338    period = div64_u64(events, event->attr.sample_freq);
1339
1340    delta = (s64)(period - hwc->sample_period);
1341    delta = (delta + 7) / 8; /* low pass filter */
1342
1343    sample_period = hwc->sample_period + delta;
1344
1345    if (!sample_period)
1346        sample_period = 1;
1347
1348    hwc->sample_period = sample_period;
1349}
1350
/*
 * Per-tick frequency maintenance for all ACTIVE events of @ctx that are
 * allowed on the current CPU:
 *   - reset the per-tick interrupt count, unthrottling events that hit
 *     MAX_INTERRUPTS;
 *   - for frequency-based events (attr.freq with a non-zero
 *     attr.sample_freq) adjust the sample period via perf_adjust_period().
 *
 * Runs under ctx->lock.
 */
static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
{
	struct perf_event *event;
	struct hw_perf_event *hwc;
	u64 interrupts, freq;

	spin_lock(&ctx->lock);
	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
		if (event->state != PERF_EVENT_STATE_ACTIVE)
			continue;

		/* Skip events bound to a different CPU (-1 means any CPU). */
		if (event->cpu != -1 && event->cpu != smp_processor_id())
			continue;

		hwc = &event->hw;

		/* Consume the interrupt count accumulated since the last tick. */
		interrupts = hwc->interrupts;
		hwc->interrupts = 0;

		/*
		 * unthrottle events on the tick
		 */
		if (interrupts == MAX_INTERRUPTS) {
			perf_log_throttle(event, 1);
			event->pmu->unthrottle(event);
			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
		}

		if (!event->attr.freq || !event->attr.sample_freq)
			continue;

		/*
		 * if the specified freq < HZ then we need to skip ticks
		 */
		if (event->attr.sample_freq < HZ) {
			freq = event->attr.sample_freq;

			/*
			 * Accumulate over several ticks; only adjust once
			 * freq_count has reached HZ.
			 */
			hwc->freq_count += freq;
			hwc->freq_interrupts += interrupts;

			if (hwc->freq_count < HZ)
				continue;

			interrupts = hwc->freq_interrupts;
			hwc->freq_interrupts = 0;
			hwc->freq_count -= HZ;
		} else
			freq = HZ;

		perf_adjust_period(event, freq * interrupts);

		/*
		 * In order to avoid being stalled by an (accidental) huge
		 * sample period, force reset the sample period if we didn't
		 * get any events in this freq period.
		 */
		if (!interrupts) {
			perf_disable();
			event->pmu->disable(event);
			atomic64_set(&hwc->period_left, 0);
			event->pmu->enable(event);
			perf_enable();
		}
	}
	spin_unlock(&ctx->lock);
}
1417
1418/*
1419 * Round-robin a context's events:
1420 */
1421static void rotate_ctx(struct perf_event_context *ctx)
1422{
1423    struct perf_event *event;
1424
1425    if (!ctx->nr_events)
1426        return;
1427
1428    spin_lock(&ctx->lock);
1429    /*
1430     * Rotate the first entry last (works just fine for group events too):
1431     */
1432    perf_disable();
1433    list_for_each_entry(event, &ctx->group_list, group_entry) {
1434        list_move_tail(&event->group_entry, &ctx->group_list);
1435        break;
1436    }
1437    perf_enable();
1438
1439    spin_unlock(&ctx->lock);
1440}
1441
1442void perf_event_task_tick(struct task_struct *curr, int cpu)
1443{
1444    struct perf_cpu_context *cpuctx;
1445    struct perf_event_context *ctx;
1446
1447    if (!atomic_read(&nr_events))
1448        return;
1449
1450    cpuctx = &per_cpu(perf_cpu_context, cpu);
1451    ctx = curr->perf_event_ctxp;
1452
1453    perf_ctx_adjust_freq(&cpuctx->ctx);
1454    if (ctx)
1455        perf_ctx_adjust_freq(ctx);
1456
1457    perf_event_cpu_sched_out(cpuctx);
1458    if (ctx)
1459        __perf_event_task_sched_out(ctx);
1460
1461    rotate_ctx(&cpuctx->ctx);
1462    if (ctx)
1463        rotate_ctx(ctx);
1464
1465    perf_event_cpu_sched_in(cpuctx, cpu);
1466    if (ctx)
1467        perf_event_task_sched_in(curr, cpu);
1468}
1469
/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 *
 * The task's context is scheduled out, every group-list event with
 * attr.enable_on_exec set has that flag cleared and, if it was not
 * already INACTIVE or better, is marked enabled; the context is then
 * scheduled back in on the current CPU.  Runs with local IRQs disabled.
 */
static void perf_event_enable_on_exec(struct task_struct *task)
{
	struct perf_event_context *ctx;
	struct perf_event *event;
	unsigned long flags;
	int enabled = 0;

	local_irq_save(flags);
	ctx = task->perf_event_ctxp;
	if (!ctx || !ctx->nr_events)
		goto out;

	__perf_event_task_sched_out(ctx);

	spin_lock(&ctx->lock);

	list_for_each_entry(event, &ctx->group_list, group_entry) {
		if (!event->attr.enable_on_exec)
			continue;
		/* One-shot: the flag is consumed by this exec. */
		event->attr.enable_on_exec = 0;
		/* Already enabled (INACTIVE or ACTIVE): nothing to change. */
		if (event->state >= PERF_EVENT_STATE_INACTIVE)
			continue;
		__perf_event_mark_enabled(event, ctx);
		enabled = 1;
	}

	/*
	 * Unclone this context if we enabled any event.
	 */
	if (enabled)
		unclone_ctx(ctx);

	spin_unlock(&ctx->lock);

	perf_event_task_sched_in(task, smp_processor_id());
 out:
	local_irq_restore(flags);
}
1512
1513/*
1514 * Cross CPU call to read the hardware event
1515 */
1516static void __perf_event_read(void *info)
1517{
1518    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1519    struct perf_event *event = info;
1520    struct perf_event_context *ctx = event->ctx;
1521    unsigned long flags;
1522
1523    /*
1524     * If this is a task context, we need to check whether it is
1525     * the current task context of this cpu. If not it has been
1526     * scheduled out before the smp call arrived. In that case
1527     * event->count would have been updated to a recent sample
1528     * when the event was scheduled out.
1529     */
1530    if (ctx->task && cpuctx->task_ctx != ctx)
1531        return;
1532
1533    local_irq_save(flags);
1534    if (ctx->is_active)
1535        update_context_time(ctx);
1536    event->pmu->read(event);
1537    update_event_times(event);
1538    local_irq_restore(flags);
1539}
1540
1541static u64 perf_event_read(struct perf_event *event)
1542{
1543    /*
1544     * If event is enabled and currently active on a CPU, update the
1545     * value in the event structure:
1546     */
1547    if (event->state == PERF_EVENT_STATE_ACTIVE) {
1548        smp_call_function_single(event->oncpu,
1549                     __perf_event_read, event, 1);
1550    } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1551        update_event_times(event);
1552    }
1553
1554    return atomic64_read(&event->count);
1555}
1556
1557/*
1558 * Initialize the perf_event context in a task_struct:
1559 */
1560static void
1561__perf_event_init_context(struct perf_event_context *ctx,
1562                struct task_struct *task)
1563{
1564    memset(ctx, 0, sizeof(*ctx));
1565    spin_lock_init(&ctx->lock);
1566    mutex_init(&ctx->mutex);
1567    INIT_LIST_HEAD(&ctx->group_list);
1568    INIT_LIST_HEAD(&ctx->event_list);
1569    atomic_set(&ctx->refcount, 1);
1570    ctx->task = task;
1571}
1572
/*
 * Look up (and if necessary create) the context an event should be
 * attached to.
 *
 * @pid: target task; 0 selects the current task.  Only used when
 *       @cpu == -1.
 * @cpu: target CPU, or -1 for a per-task event.
 *
 * Returns the context on success, or an ERR_PTR():
 *   -EACCES  caller lacks permission (paranoid CPU setting without
 *            CAP_SYS_ADMIN, or ptrace access check failed)
 *   -EINVAL  @cpu is out of range
 *   -ENODEV  @cpu is not online
 *   -ESRCH   no such task, or the task is exiting
 *   -ENOMEM  context allocation failed
 */
static struct perf_event_context *find_get_context(pid_t pid, int cpu)
{
	struct perf_event_context *ctx;
	struct perf_cpu_context *cpuctx;
	struct task_struct *task;
	unsigned long flags;
	int err;

	/*
	 * If cpu is not a wildcard then this is a percpu event:
	 */
	if (cpu != -1) {
		/* Must be root to operate on a CPU event: */
		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
			return ERR_PTR(-EACCES);

		if (cpu < 0 || cpu >= nr_cpumask_bits)
			return ERR_PTR(-EINVAL);

		/*
		 * We could be clever and allow to attach an event to an
		 * offline CPU and activate it when the CPU comes up, but
		 * that's for later.
		 */
		if (!cpu_isset(cpu, cpu_online_map))
			return ERR_PTR(-ENODEV);

		cpuctx = &per_cpu(perf_cpu_context, cpu);
		ctx = &cpuctx->ctx;
		get_ctx(ctx);

		return ctx;
	}

	/* Per-task event: pin the task while we work on it. */
	rcu_read_lock();
	if (!pid)
		task = current;
	else
		task = find_task_by_vpid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task)
		return ERR_PTR(-ESRCH);

	/*
	 * Can't attach events to a dying task.
	 */
	err = -ESRCH;
	if (task->flags & PF_EXITING)
		goto errout;

	/* Reuse ptrace permission checks for now. */
	err = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto errout;

 retry:
	/*
	 * NOTE(review): perf_lock_task_context() is defined outside this
	 * chunk; it presumably returns the context locked and with a
	 * reference held -- confirm.
	 */
	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		unclone_ctx(ctx);
		spin_unlock_irqrestore(&ctx->lock, flags);
	}

	if (!ctx) {
		ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
		err = -ENOMEM;
		if (!ctx)
			goto errout;
		__perf_event_init_context(ctx, task);
		get_ctx(ctx);
		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
			/*
			 * We raced with some other task; use
			 * the context they set.
			 */
			kfree(ctx);
			goto retry;
		}
		/* Extra task reference taken once the context is installed. */
		get_task_struct(task);
	}

	/* Drop the lookup reference taken above. */
	put_task_struct(task);
	return ctx;

 errout:
	put_task_struct(task);
	return ERR_PTR(err);
}
1663
1664static void free_event_rcu(struct rcu_head *head)
1665{
1666    struct perf_event *event;
1667
1668    event = container_of(head, struct perf_event, rcu_head);
1669    if (event->ns)
1670        put_pid_ns(event->ns);
1671    kfree(event);
1672}
1673
1674static void perf_pending_sync(struct perf_event *event);
1675
1676static void free_event(struct perf_event *event)
1677{
1678    perf_pending_sync(event);
1679
1680    if (!event->parent) {
1681        atomic_dec(&nr_events);
1682        if (event->attr.mmap)
1683            atomic_dec(&nr_mmap_events);
1684        if (event->attr.comm)
1685            atomic_dec(&nr_comm_events);
1686        if (event->attr.task)
1687            atomic_dec(&nr_task_events);
1688    }
1689
1690    if (event->output) {
1691        fput(event->output->filp);
1692        event->output = NULL;
1693    }
1694
1695    if (event->destroy)
1696        event->destroy(event);
1697
1698    put_ctx(event->ctx);
1699    call_rcu(&event->rcu_head, free_event_rcu);
1700}
1701
1702/*
1703 * Called when the last reference to the file is gone.
1704 */
1705static int perf_release(struct inode *inode, struct file *file)
1706{
1707    struct perf_event *event = file->private_data;
1708    struct perf_event_context *ctx = event->ctx;
1709
1710    file->private_data = NULL;
1711
1712    WARN_ON_ONCE(ctx->parent_ctx);
1713    mutex_lock(&ctx->mutex);
1714    perf_event_remove_from_context(event);
1715    mutex_unlock(&ctx->mutex);
1716
1717    mutex_lock(&event->owner->perf_event_mutex);
1718    list_del_init(&event->owner_entry);
1719    mutex_unlock(&event->owner->perf_event_mutex);
1720    put_task_struct(event->owner);
1721
1722    free_event(event);
1723
1724    return 0;
1725}
1726
1727static int perf_event_read_size(struct perf_event *event)
1728{
1729    int entry = sizeof(u64); /* value */
1730    int size = 0;
1731    int nr = 1;
1732
1733    if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1734        size += sizeof(u64);
1735
1736    if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1737        size += sizeof(u64);
1738
1739    if (event->attr.read_format & PERF_FORMAT_ID)
1740        entry += sizeof(u64);
1741
1742    if (event->attr.read_format & PERF_FORMAT_GROUP) {
1743        nr += event->group_leader->nr_siblings;
1744        size += sizeof(u64);
1745    }
1746
1747    size += entry * nr;
1748
1749    return size;
1750}
1751
1752static u64 perf_event_read_value(struct perf_event *event)
1753{
1754    struct perf_event *child;
1755    u64 total = 0;
1756
1757    total += perf_event_read(event);
1758    list_for_each_entry(child, &event->child_list, child_list)
1759        total += perf_event_read(child);
1760
1761    return total;
1762}
1763
1764static int perf_event_read_entry(struct perf_event *event,
1765                   u64 read_format, char __user *buf)
1766{
1767    int n = 0, count = 0;
1768    u64 values[2];
1769
1770    values[n++] = perf_event_read_value(event);
1771    if (read_format & PERF_FORMAT_ID)
1772        values[n++] = primary_event_id(event);
1773
1774    count = n * sizeof(u64);
1775
1776    if (copy_to_user(buf, values, count))
1777        return -EFAULT;
1778
1779    return count;
1780}
1781
1782static int perf_event_read_group(struct perf_event *event,
1783                   u64 read_format, char __user *buf)
1784{
1785    struct perf_event *leader = event->group_leader, *sub;
1786    int n = 0, size = 0, err = -EFAULT;
1787    u64 values[3];
1788
1789    values[n++] = 1 + leader->nr_siblings;
1790    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1791        values[n++] = leader->total_time_enabled +
1792            atomic64_read(&leader->child_total_time_enabled);
1793    }
1794    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1795        values[n++] = leader->total_time_running +
1796            atomic64_read(&leader->child_total_time_running);
1797    }
1798
1799    size = n * sizeof(u64);
1800
1801    if (copy_to_user(buf, values, size))
1802        return -EFAULT;
1803
1804    err = perf_event_read_entry(leader, read_format, buf + size);
1805    if (err < 0)
1806        return err;
1807
1808    size += err;
1809
1810    list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1811        err = perf_event_read_entry(sub, read_format,
1812                buf + size);
1813        if (err < 0)
1814            return err;
1815
1816        size += err;
1817    }
1818
1819    return size;
1820}
1821
1822static int perf_event_read_one(struct perf_event *event,
1823                 u64 read_format, char __user *buf)
1824{
1825    u64 values[4];
1826    int n = 0;
1827
1828    values[n++] = perf_event_read_value(event);
1829    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1830        values[n++] = event->total_time_enabled +
1831            atomic64_read(&event->child_total_time_enabled);
1832    }
1833    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1834        values[n++] = event->total_time_running +
1835            atomic64_read(&event->child_total_time_running);
1836    }
1837    if (read_format & PERF_FORMAT_ID)
1838        values[n++] = primary_event_id(event);
1839
1840    if (copy_to_user(buf, values, n * sizeof(u64)))
1841        return -EFAULT;
1842
1843    return n * sizeof(u64);
1844}
1845
1846/*
1847 * Read the performance event - simple non blocking version for now
1848 */
1849static ssize_t
1850perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1851{
1852    u64 read_format = event->attr.read_format;
1853    int ret;
1854
1855    /*
1856     * Return end-of-file for a read on a event that is in
1857     * error state (i.e. because it was pinned but it couldn't be
1858     * scheduled on to the CPU at some point).
1859     */
1860    if (event->state == PERF_EVENT_STATE_ERROR)
1861        return 0;
1862
1863    if (count < perf_event_read_size(event))
1864        return -ENOSPC;
1865
1866    WARN_ON_ONCE(event->ctx->parent_ctx);
1867    mutex_lock(&event->child_mutex);
1868    if (read_format & PERF_FORMAT_GROUP)
1869        ret = perf_event_read_group(event, read_format, buf);
1870    else
1871        ret = perf_event_read_one(event, read_format, buf);
1872    mutex_unlock(&event->child_mutex);
1873
1874    return ret;
1875}
1876
1877static ssize_t
1878perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1879{
1880    struct perf_event *event = file->private_data;
1881
1882    return perf_read_hw(event, buf, count);
1883}
1884
1885static unsigned int perf_poll(struct file *file, poll_table *wait)
1886{
1887    struct perf_event *event = file->private_data;
1888    struct perf_mmap_data *data;
1889    unsigned int events = POLL_HUP;
1890
1891    rcu_read_lock();
1892    data = rcu_dereference(event->data);
1893    if (data)
1894        events = atomic_xchg(&data->poll, 0);
1895    rcu_read_unlock();
1896
1897    poll_wait(file, &event->waitq, wait);
1898
1899    return events;
1900}
1901
1902static void perf_event_reset(struct perf_event *event)
1903{
1904    (void)perf_event_read(event);
1905    atomic64_set(&event->count, 0);
1906    perf_event_update_userpage(event);
1907}
1908
1909/*
1910 * Holding the top-level event's child_mutex means that any
1911 * descendant process that has inherited this event will block
1912 * in sync_child_event if it goes to exit, thus satisfying the
1913 * task existence requirements of perf_event_enable/disable.
1914 */
1915static void perf_event_for_each_child(struct perf_event *event,
1916                    void (*func)(struct perf_event *))
1917{
1918    struct perf_event *child;
1919
1920    WARN_ON_ONCE(event->ctx->parent_ctx);
1921    mutex_lock(&event->child_mutex);
1922    func(event);
1923    list_for_each_entry(child, &event->child_list, child_list)
1924        func(child);
1925    mutex_unlock(&event->child_mutex);
1926}
1927
1928static void perf_event_for_each(struct perf_event *event,
1929                  void (*func)(struct perf_event *))
1930{
1931    struct perf_event_context *ctx = event->ctx;
1932    struct perf_event *sibling;
1933
1934    WARN_ON_ONCE(ctx->parent_ctx);
1935    mutex_lock(&ctx->mutex);
1936    event = event->group_leader;
1937
1938    perf_event_for_each_child(event, func);
1939    func(event);
1940    list_for_each_entry(sibling, &event->sibling_list, group_entry)
1941        perf_event_for_each_child(event, func);
1942    mutex_unlock(&ctx->mutex);
1943}
1944
1945static int perf_event_period(struct perf_event *event, u64 __user *arg)
1946{
1947    struct perf_event_context *ctx = event->ctx;
1948    unsigned long size;
1949    int ret = 0;
1950    u64 value;
1951
1952    if (!event->attr.sample_period)
1953        return -EINVAL;
1954
1955    size = copy_from_user(&value, arg, sizeof(value));
1956    if (size != sizeof(value))
1957        return -EFAULT;
1958
1959    if (!value)
1960        return -EINVAL;
1961
1962    spin_lock_irq(&ctx->lock);
1963    if (event->attr.freq) {
1964        if (value > sysctl_perf_event_sample_rate) {
1965            ret = -EINVAL;
1966            goto unlock;
1967        }
1968
1969        event->attr.sample_freq = value;
1970    } else {
1971        event->attr.sample_period = value;
1972        event->hw.sample_period = value;
1973    }
1974unlock:
1975    spin_unlock_irq(&ctx->lock);
1976
1977    return ret;
1978}
1979
1980int perf_event_set_output(struct perf_event *event, int output_fd);
1981
1982static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1983{
1984    struct perf_event *event = file->private_data;
1985    void (*func)(struct perf_event *);
1986    u32 flags = arg;
1987
1988    switch (cmd) {
1989    case PERF_EVENT_IOC_ENABLE:
1990        func = perf_event_enable;
1991        break;
1992    case PERF_EVENT_IOC_DISABLE:
1993        func = perf_event_disable;
1994        break;
1995    case PERF_EVENT_IOC_RESET:
1996        func = perf_event_reset;
1997        break;
1998
1999    case PERF_EVENT_IOC_REFRESH:
2000        return perf_event_refresh(event, arg);
2001
2002    case PERF_EVENT_IOC_PERIOD:
2003        return perf_event_period(event, (u64 __user *)arg);
2004
2005    case PERF_EVENT_IOC_SET_OUTPUT:
2006        return perf_event_set_output(event, arg);
2007
2008    default:
2009        return -ENOTTY;
2010    }
2011
2012    if (flags & PERF_IOC_FLAG_GROUP)
2013        perf_event_for_each(event, func);
2014    else
2015        perf_event_for_each_child(event, func);
2016
2017    return 0;
2018}
2019
2020int perf_event_task_enable(void)
2021{
2022    struct perf_event *event;
2023
2024    mutex_lock(&current->perf_event_mutex);
2025    list_for_each_entry(event, &current->perf_event_list, owner_entry)
2026        perf_event_for_each_child(event, perf_event_enable);
2027    mutex_unlock(&current->perf_event_mutex);
2028
2029    return 0;
2030}
2031
2032int perf_event_task_disable(void)
2033{
2034    struct perf_event *event;
2035
2036    mutex_lock(&current->perf_event_mutex);
2037    list_for_each_entry(event, &current->perf_event_list, owner_entry)
2038        perf_event_for_each_child(event, perf_event_disable);
2039    mutex_unlock(&current->perf_event_mutex);
2040
2041    return 0;
2042}
2043
/*
 * Offset subtracted from the hardware index in perf_event_index().
 * Defaults to 0 unless already defined elsewhere (presumably by
 * architecture code -- TODO confirm).
 */
#ifndef PERF_EVENT_INDEX_OFFSET
# define PERF_EVENT_INDEX_OFFSET 0
#endif
2047
2048static int perf_event_index(struct perf_event *event)
2049{
2050    if (event->state != PERF_EVENT_STATE_ACTIVE)
2051        return 0;
2052
2053    return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2054}
2055
/*
 * Publish the current state of @event (index, count offset and
 * enabled/running times) in the mmap()ed user page, if one is attached.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
	struct perf_event_mmap_page *userpg;
	struct perf_mmap_data *data;

	rcu_read_lock();
	data = rcu_dereference(event->data);
	if (!data)
		goto unlock;

	userpg = data->user_page;

	/*
	 * Disable preemption so as to not let the corresponding user-space
	 * spin too long if we get preempted.
	 */
	preempt_disable();
	/* First increment of ->lock: the update section begins. */
	++userpg->lock;
	barrier();
	userpg->index = perf_event_index(event);
	userpg->offset = atomic64_read(&event->count);
	/* While the event is ACTIVE, subtract hw.prev_count from the offset. */
	if (event->state == PERF_EVENT_STATE_ACTIVE)
		userpg->offset -= atomic64_read(&event->hw.prev_count);

	userpg->time_enabled = event->total_time_enabled +
			atomic64_read(&event->child_total_time_enabled);

	userpg->time_running = event->total_time_running +
			atomic64_read(&event->child_total_time_running);

	barrier();
	/* Second increment of ->lock: the update section is complete. */
	++userpg->lock;
	preempt_enable();
unlock:
	rcu_read_unlock();
}
2097
2098static unsigned long perf_data_size(struct perf_mmap_data *data)
2099{
2100    return data->nr_pages << (PAGE_SHIFT + data->data_order);
2101}
2102
2103#ifndef CONFIG_PERF_USE_VMALLOC
2104
2105/*
2106 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2107 */
2108
2109static struct page *
2110perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2111{
2112    if (pgoff > data->nr_pages)
2113        return NULL;
2114
2115    if (pgoff == 0)
2116        return virt_to_page(data->user_page);
2117
2118    return virt_to_page(data->data_pages[pgoff - 1]);
2119}
2120
2121static struct perf_mmap_data *
2122perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2123{
2124    struct perf_mmap_data *data;
2125    unsigned long size;
2126    int i;
2127
2128    WARN_ON(atomic_read(&event->mmap_count));
2129
2130    size = sizeof(struct perf_mmap_data);
2131    size += nr_pages * sizeof(void *);
2132
2133    data = kzalloc(size, GFP_KERNEL);
2134    if (!data)
2135        goto fail;
2136
2137    data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2138    if (!data->user_page)
2139        goto fail_user_page;
2140
2141    for (i = 0; i < nr_pages; i++) {
2142        data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2143        if (!data->data_pages[i])
2144            goto fail_data_pages;
2145    }
2146
2147    data->data_order = 0;
2148    data->nr_pages = nr_pages;
2149
2150    return data;
2151
2152fail_data_pages:
2153    for (i--; i >= 0; i--)
2154        free_page((unsigned long)data->data_pages[i]);
2155
2156    free_page((unsigned long)data->user_page);
2157
2158fail_user_page:
2159    kfree(data);
2160
2161fail:
2162    return NULL;
2163}
2164
2165static void perf_mmap_free_page(unsigned long addr)
2166{
2167    struct page *page = virt_to_page((void *)addr);
2168
2169    page->mapping = NULL;
2170    __free_page(page);
2171}
2172
2173static void perf_mmap_data_free(struct perf_mmap_data *data)
2174{
2175    int i;
2176
2177    perf_mmap_free_page((unsigned long)data->user_page);
2178    for (i = 0; i < data->nr_pages; i++)
2179        perf_mmap_free_page((unsigned long)data->data_pages[i]);
2180    kfree(data);
2181}
2182
2183#else
2184
2185/*
2186 * Back perf_mmap() with vmalloc memory.
2187 *
2188 * Required for architectures that have d-cache aliasing issues.
2189 */
2190
2191static struct page *
2192perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2193{
2194    if (pgoff > (1UL << data->data_order))
2195        return NULL;
2196
2197    return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2198}
2199
2200static void perf_mmap_unmark_page(void *addr)
2201{
2202    struct page *page = vmalloc_to_page(addr);
2203
2204    page->mapping = NULL;
2205}
2206
2207static void perf_mmap_data_free_work(struct work_struct *work)
2208{
2209    struct perf_mmap_data *data;
2210    void *base;
2211    int i, nr;
2212
2213    data = container_of(work, struct perf_mmap_data, work);
2214    nr = 1 << data->data_order;
2215
2216    base = data->user_page;
2217    for (i = 0; i < nr + 1; i++)
2218        perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2219
2220    vfree(base);
2221    kfree(data);
2222}
2223
2224static void perf_mmap_data_free(struct perf_mmap_data *data)
2225{
2226    schedule_work(&data->work);
2227}
2228
2229static struct perf_mmap_data *
2230perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2231{
2232    struct perf_mmap_data *data;
2233    unsigned long size;
2234    void *all_buf;
2235
2236    WARN_ON(atomic_read(&event->mmap_count));
2237
2238    size = sizeof(struct perf_mmap_data);
2239    size += sizeof(void *);
2240
2241    data = kzalloc(size, GFP_KERNEL);
2242    if (!data)
2243        goto fail;
2244
2245    INIT_WORK(&data->work, perf_mmap_data_free_work);
2246
2247    all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2248    if (!all_buf)
2249        goto fail_all_buf;
2250
2251    data->user_page = all_buf;
2252    data->data_pages[0] = all_buf + PAGE_SIZE;
2253    data->data_order = ilog2(nr_pages);
2254    data->nr_pages = 1;
2255
2256    return data;
2257
2258fail_all_buf:
2259    kfree(data);
2260
2261fail:
2262    return NULL;
2263}
2264
2265#endif
2266
2267static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2268{
2269    struct perf_event *event = vma->vm_file->private_data;
2270    struct perf_mmap_data *data;
2271    int ret = VM_FAULT_SIGBUS;
2272
2273    if (vmf->flags & FAULT_FLAG_MKWRITE) {
2274        if (vmf->pgoff == 0)
2275            ret = 0;
2276        return ret;
2277    }
2278
2279    rcu_read_lock();
2280    data = rcu_dereference(event->data);
2281    if (!data)
2282        goto unlock;
2283
2284    if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2285        goto unlock;
2286
2287    vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2288    if (!vmf->page)
2289        goto unlock;
2290
2291    get_page(vmf->page);
2292    vmf->page->mapping = vma->vm_file->f_mapping;
2293    vmf->page->index = vmf->pgoff;
2294
2295    ret = 0;
2296unlock:
2297    rcu_read_unlock();
2298
2299    return ret;
2300}
2301
2302static void
2303perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2304{
2305    long max_size = perf_data_size(data);
2306
2307    atomic_set(&data->lock, -1);
2308
2309    if (event->attr.watermark) {
2310        data->watermark = min_t(long, max_size,
2311                    event->attr.wakeup_watermark);
2312    }
2313
2314    if (!data->watermark)
2315        data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2316
2317
2318    rcu_assign_pointer(event->data, data);
2319}
2320
2321static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2322{
2323    struct perf_mmap_data *data;
2324
2325    data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2326    perf_mmap_data_free(data);
2327}
2328
2329static void perf_mmap_data_release(struct perf_event *event)
2330{
2331    struct perf_mmap_data *data = event->data;
2332
2333    WARN_ON(atomic_read(&event->mmap_count));
2334
2335    rcu_assign_pointer(event->data, NULL);
2336    call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2337}
2338
2339static void perf_mmap_open(struct vm_area_struct *vma)
2340{
2341    struct perf_event *event = vma->vm_file->private_data;
2342
2343    atomic_inc(&event->mmap_count);
2344}
2345
2346static void perf_mmap_close(struct vm_area_struct *vma)
2347{
2348    struct perf_event *event = vma->vm_file->private_data;
2349
2350    WARN_ON_ONCE(event->ctx->parent_ctx);
2351    if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2352        unsigned long size = perf_data_size(event->data);
2353        struct user_struct *user = current_user();
2354
2355        atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2356        vma->vm_mm->locked_vm -= event->data->nr_locked;
2357        perf_mmap_data_release(event);
2358        mutex_unlock(&event->mmap_mutex);
2359    }
2360}
2361
/*
 * VMA callbacks for perf buffer mappings. The same handler serves both
 * ordinary faults and page_mkwrite notifications; it tells them apart by
 * FAULT_FLAG_MKWRITE.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
    .open = perf_mmap_open,
    .close = perf_mmap_close,
    .fault = perf_mmap_fault,
    .page_mkwrite = perf_mmap_fault,
};
2368
2369static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2370{
2371    struct perf_event *event = file->private_data;
2372    unsigned long user_locked, user_lock_limit;
2373    struct user_struct *user = current_user();
2374    unsigned long locked, lock_limit;
2375    struct perf_mmap_data *data;
2376    unsigned long vma_size;
2377    unsigned long nr_pages;
2378    long user_extra, extra;
2379    int ret = 0;
2380
2381    if (!(vma->vm_flags & VM_SHARED))
2382        return -EINVAL;
2383
2384    vma_size = vma->vm_end - vma->vm_start;
2385    nr_pages = (vma_size / PAGE_SIZE) - 1;
2386
2387    /*
2388     * If we have data pages ensure they're a power-of-two number, so we
2389     * can do bitmasks instead of modulo.
2390     */
2391    if (nr_pages != 0 && !is_power_of_2(nr_pages))
2392        return -EINVAL;
2393
2394    if (vma_size != PAGE_SIZE * (1 + nr_pages))
2395        return -EINVAL;
2396
2397    if (vma->vm_pgoff != 0)
2398        return -EINVAL;
2399
2400    WARN_ON_ONCE(event->ctx->parent_ctx);
2401    mutex_lock(&event->mmap_mutex);
2402    if (event->output) {
2403        ret = -EINVAL;
2404        goto unlock;
2405    }
2406
2407    if (atomic_inc_not_zero(&event->mmap_count)) {
2408        if (nr_pages != event->data->nr_pages)
2409            ret = -EINVAL;
2410        goto unlock;
2411    }
2412
2413    user_extra = nr_pages + 1;
2414    user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2415
2416    /*
2417     * Increase the limit linearly with more CPUs:
2418     */
2419    user_lock_limit *= num_online_cpus();
2420
2421    user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2422
2423    extra = 0;
2424    if (user_locked > user_lock_limit)
2425        extra = user_locked - user_lock_limit;
2426
2427    lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2428    lock_limit >>= PAGE_SHIFT;
2429    locked = vma->vm_mm->locked_vm + extra;
2430
2431    if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2432        !capable(CAP_IPC_LOCK)) {
2433        ret = -EPERM;
2434        goto unlock;
2435    }
2436
2437    WARN_ON(event->data);
2438
2439    data = perf_mmap_data_alloc(event, nr_pages);
2440    ret = -ENOMEM;
2441    if (!data)
2442        goto unlock;
2443
2444    ret = 0;
2445    perf_mmap_data_init(event, data);
2446
2447    atomic_set(&event->mmap_count, 1);
2448    atomic_long_add(user_extra, &user->locked_vm);
2449    vma->vm_mm->locked_vm += extra;
2450    event->data->nr_locked = extra;
2451    if (vma->vm_flags & VM_WRITE)
2452        event->data->writable = 1;
2453
2454unlock:
2455    mutex_unlock(&event->mmap_mutex);
2456
2457    vma->vm_flags |= VM_RESERVED;
2458    vma->vm_ops = &perf_mmap_vmops;
2459
2460    return ret;
2461}
2462
2463static int perf_fasync(int fd, struct file *filp, int on)
2464{
2465    struct inode *inode = filp->f_path.dentry->d_inode;
2466    struct perf_event *event = filp->private_data;
2467    int retval;
2468
2469    mutex_lock(&inode->i_mutex);
2470    retval = fasync_helper(fd, filp, on, &event->fasync);
2471    mutex_unlock(&inode->i_mutex);
2472
2473    if (retval < 0)
2474        return retval;
2475
2476    return 0;
2477}
2478
/*
 * File operations of a perf event file descriptor. Native and compat
 * ioctls share one handler.
 */
static const struct file_operations perf_fops = {
    .release = perf_release,
    .read = perf_read,
    .poll = perf_poll,
    .unlocked_ioctl = perf_ioctl,
    .compat_ioctl = perf_ioctl,
    .mmap = perf_mmap,
    .fasync = perf_fasync,
};
2488
2489/*
2490 * Perf event wakeup
2491 *
2492 * If there's data, ensure we set the poll() state and publish everything
2493 * to user-space before waking everybody up.
2494 */
2495
2496void perf_event_wakeup(struct perf_event *event)
2497{
2498    wake_up_all(&event->waitq);
2499
2500    if (event->pending_kill) {
2501        kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2502        event->pending_kill = 0;
2503    }
2504}
2505
2506/*
2507 * Pending wakeups
2508 *
 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2510 *
2511 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2512 * single linked list and use cmpxchg() to add entries lockless.
2513 */
2514
2515static void perf_pending_event(struct perf_pending_entry *entry)
2516{
2517    struct perf_event *event = container_of(entry,
2518            struct perf_event, pending);
2519
2520    if (event->pending_disable) {
2521        event->pending_disable = 0;
2522        __perf_event_disable(event);
2523    }
2524
2525    if (event->pending_wakeup) {
2526        event->pending_wakeup = 0;
2527        perf_event_wakeup(event);
2528    }
2529}
2530
/*
 * Terminator of the per-CPU pending lists. A non-NULL sentinel is used so
 * that entry->next == NULL can stand for "not queued".
 */
#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)

/* Head of this CPU's singly linked pending list; starts out empty. */
static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
    PENDING_TAIL,
};
2536
/*
 * Queue @entry on the current CPU's pending list, to have @func called on
 * it from __perf_pending_run(). Uses only cmpxchg(), no locks. An entry
 * that is already queued is left alone.
 */
static void perf_pending_queue(struct perf_pending_entry *entry,
                   void (*func)(struct perf_pending_entry *))
{
    struct perf_pending_entry **head;

    /*
     * Claim the entry: ->next goes from NULL (idle) to non-NULL. If it
     * was not NULL the entry is already on a list.
     */
    if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
        return;

    entry->func = func;

    head = &get_cpu_var(perf_pending_head);

    /* Push onto the list head, retrying if the head changed under us. */
    do {
        entry->next = *head;
    } while (cmpxchg(head, entry->next, entry) != entry->next);

    set_perf_event_pending();

    put_cpu_var(perf_pending_head);
}
2557
2558static int __perf_pending_run(void)
2559{
2560    struct perf_pending_entry *list;
2561    int nr = 0;
2562
2563    list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2564    while (list != PENDING_TAIL) {
2565        void (*func)(struct perf_pending_entry *);
2566        struct perf_pending_entry *entry = list;
2567
2568        list = list->next;
2569
2570        func = entry->func;
2571        entry->next = NULL;
2572        /*
2573         * Ensure we observe the unqueue before we issue the wakeup,
2574         * so that we won't be waiting forever.
2575         * -- see perf_not_pending().
2576         */
2577        smp_wmb();
2578
2579        func(entry);
2580        nr++;
2581    }
2582
2583    return nr;
2584}
2585
2586static inline int perf_not_pending(struct perf_event *event)
2587{
2588    /*
2589     * If we flush on whatever cpu we run, there is a chance we don't
2590     * need to wait.
2591     */
2592    get_cpu();
2593    __perf_pending_run();
2594    put_cpu();
2595
2596    /*
2597     * Ensure we see the proper queue state before going to sleep
2598     * so that we do not miss the wakeup. -- see perf_pending_handle()
2599     */
2600    smp_rmb();
2601    return event->pending.next == NULL;
2602}
2603
/*
 * Sleep on the event's wait queue until the event is no longer queued on
 * any pending list.
 */
static void perf_pending_sync(struct perf_event *event)
{
    wait_event(event->waitq, perf_not_pending(event));
}
2608
/*
 * Run all pending entries queued on the current CPU; the processed count
 * is discarded.
 */
void perf_event_do_pending(void)
{
    __perf_pending_run();
}
2613
/*
 * Callchain support -- arch specific
 *
 * Weak default used when no other definition is linked in: no callchain
 * is available, so NULL is returned.
 */

__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
    return NULL;
}
2622
2623/*
2624 * Output
2625 */
2626static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2627                  unsigned long offset, unsigned long head)
2628{
2629    unsigned long mask;
2630
2631    if (!data->writable)
2632        return true;
2633
2634    mask = perf_data_size(data) - 1;
2635
2636    offset = (offset - tail) & mask;
2637    head = (head - tail) & mask;
2638
2639    if ((int)(head - offset) < 0)
2640        return false;
2641
2642    return true;
2643}
2644
2645static void perf_output_wakeup(struct perf_output_handle *handle)
2646{
2647    atomic_set(&handle->data->poll, POLL_IN);
2648
2649    if (handle->nmi) {
2650        handle->event->pending_wakeup = 1;
2651        perf_pending_queue(&handle->event->pending,
2652                   perf_pending_event);
2653    } else
2654        perf_event_wakeup(handle->event);
2655}
2656
/*
 * Curious locking construct.
 *
 * We need to ensure a later event_id doesn't publish a head when a former
 * event_id isn't done writing. However since we need to deal with NMIs we
 * cannot fully serialize things.
 *
 * What we do is serialize between CPUs so we only have to deal with NMI
 * nesting on a single CPU.
 *
 * We only publish the head (and generate a wakeup) when the outer-most
 * event_id completes.
 *
 * data->lock holds -1 while free and the owning CPU number while taken.
 * On return interrupts are disabled (the previous state is saved in
 * handle->flags) and handle->locked tells whether this call took the
 * lock (1) or is nested inside the owner on the same CPU (0).
 */
static void perf_output_lock(struct perf_output_handle *handle)
{
    struct perf_mmap_data *data = handle->data;
    int cpu;

    handle->locked = 0;

    local_irq_save(handle->flags);
    cpu = smp_processor_id();

    /* NMI nested inside the lock owner on this CPU: do not spin on it. */
    if (in_nmi() && atomic_read(&data->lock) == cpu)
        return;

    /* Spin until the lock is free, then claim it for this CPU. */
    while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
        cpu_relax();

    handle->locked = 1;
}
2688
/*
 * Counterpart of perf_output_lock().
 *
 * Every caller records the current head in done_head. Only the caller
 * that actually holds the lock (handle->locked) publishes it to the
 * control page, drops the lock and issues a requested wakeup; a nested
 * caller just restores the interrupt state.
 */
static void perf_output_unlock(struct perf_output_handle *handle)
{
    struct perf_mmap_data *data = handle->data;
    unsigned long head;
    int cpu;

    data->done_head = data->head;

    if (!handle->locked)
        goto out;

again:
    /*
     * The xchg implies a full barrier that ensures all writes are done
     * before we publish the new head, matched by a rmb() in userspace when
     * reading this position.
     */
    while ((head = atomic_long_xchg(&data->done_head, 0)))
        data->user_page->data_head = head;

    /*
     * NMI can happen here, which means we can miss a done_head update.
     */

    /* Release the lock; the old value must be our own CPU number. */
    cpu = atomic_xchg(&data->lock, -1);
    WARN_ON_ONCE(cpu != smp_processor_id());

    /*
     * Therefore we have to validate we did not indeed do so.
     */
    if (unlikely(atomic_long_read(&data->done_head))) {
        /*
         * Since we had it locked, we can lock it again.
         */
        while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
            cpu_relax();

        goto again;
    }

    /* Consume a wakeup request raised while records were written. */
    if (atomic_xchg(&data->wakeup, 0))
        perf_output_wakeup(handle);
out:
    local_irq_restore(handle->flags);
}
2734
2735void perf_output_copy(struct perf_output_handle *handle,
2736              const void *buf, unsigned int len)
2737{
2738    unsigned int pages_mask;
2739    unsigned long offset;
2740    unsigned int size;
2741    void **pages;
2742
2743    offset = handle->offset;
2744    pages_mask = handle->data->nr_pages - 1;
2745    pages = handle->data->data_pages;
2746
2747    do {
2748        unsigned long page_offset;
2749        unsigned long page_size;
2750        int nr;
2751
2752        nr = (offset >> PAGE_SHIFT) & pages_mask;
2753        page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2754        page_offset = offset & (page_size - 1);
2755        size = min_t(unsigned int, page_size - page_offset, len);
2756
2757        memcpy(pages[nr] + page_offset, buf, size);
2758
2759        len -= size;
2760        buf += size;
2761        offset += size;
2762    } while (len);
2763
2764    handle->offset = offset;
2765
2766    /*
2767     * Check we didn't copy past our reservation window, taking the
2768     * possible unsigned int wrap into account.
2769     */
2770    WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2771}
2772
2773int perf_output_begin(struct perf_output_handle *handle,
2774              struct perf_event *event, unsigned int size,
2775              int nmi, int sample)
2776{
2777    struct perf_event *output_event;
2778    struct perf_mmap_data *data;
2779    unsigned long tail, offset, head;
2780    int have_lost;
2781    struct {
2782        struct perf_event_header header;
2783        u64 id;
2784        u64 lost;
2785    } lost_event;
2786
2787    rcu_read_lock();
2788    /*
2789     * For inherited events we send all the output towards the parent.
2790     */
2791    if (event->parent)
2792        event = event->parent;
2793
2794    output_event = rcu_dereference(event->output);
2795    if (output_event)
2796        event = output_event;
2797
2798    data = rcu_dereference(event->data);
2799    if (!data)
2800        goto out;
2801
2802    handle->data = data;
2803    handle->event = event;
2804    handle->nmi = nmi;
2805    handle->sample = sample;
2806
2807    if (!data->nr_pages)
2808        goto fail;
2809
2810    have_lost = atomic_read(&data->lost);
2811    if (have_lost)
2812        size += sizeof(lost_event);
2813
2814    perf_output_lock(handle);
2815
2816    do {
2817        /*
2818         * Userspace could choose to issue a mb() before updating the
2819         * tail pointer. So that all reads will be completed before the
2820         * write is issued.
2821         */
2822        tail = ACCESS_ONCE(data->user_page->data_tail);
2823        smp_rmb();
2824        offset = head = atomic_long_read(&data->head);
2825        head += size;
2826        if (unlikely(!perf_output_space(data, tail, offset, head)))
2827            goto fail;
2828    } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2829
2830    handle->offset = offset;
2831    handle->head = head;
2832
2833    if (head - tail > data->watermark)
2834        atomic_set(&data->wakeup, 1);
2835
2836    if (have_lost) {
2837        lost_event.header.type = PERF_RECORD_LOST;
2838        lost_event.header.misc = 0;
2839        lost_event.header.size = sizeof(lost_event);
2840        lost_event.id = event->id;
2841        lost_event.lost = atomic_xchg(&data->lost, 0);
2842
2843        perf_output_put(handle, lost_event);
2844    }
2845
2846    return 0;
2847
2848fail:
2849    atomic_inc(&data->lost);
2850    perf_output_unlock(handle);
2851out:
2852    rcu_read_unlock();
2853
2854    return -ENOSPC;
2855}
2856
2857void perf_output_end(struct perf_output_handle *handle)
2858{
2859    struct perf_event *event = handle->event;
2860    struct perf_mmap_data *data = handle->data;
2861
2862    int wakeup_events = event->attr.wakeup_events;
2863
2864    if (handle->sample && wakeup_events) {
2865        int events = atomic_inc_return(&data->events);
2866        if (events >= wakeup_events) {
2867            atomic_sub(wakeup_events, &data->events);
2868            atomic_set(&data->wakeup, 1);
2869        }
2870    }
2871
2872    perf_output_unlock(handle);
2873    rcu_read_unlock();
2874}
2875
2876static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2877{
2878    /*
2879     * only top level events have the pid namespace they were created in
2880     */
2881    if (event->parent)
2882        event = event->parent;
2883
2884    return task_tgid_nr_ns(p, event->ns);
2885}
2886
2887static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2888{
2889    /*
2890     * only top level events have the pid namespace they were created in
2891     */
2892    if (event->parent)
2893        event = event->parent;
2894
2895    return task_pid_nr_ns(p, event->ns);
2896}
2897
2898static void perf_output_read_one(struct perf_output_handle *handle,
2899                 struct perf_event *event)
2900{
2901    u64 read_format = event->attr.read_format;
2902    u64 values[4];
2903    int n = 0;
2904
2905    values[n++] = atomic64_read(&event->count);
2906    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2907        values[n++] = event->total_time_enabled +
2908            atomic64_read(&event->child_total_time_enabled);
2909    }
2910    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2911        values[n++] = event->total_time_running +
2912            atomic64_read(&event->child_total_time_running);
2913    }
2914    if (read_format & PERF_FORMAT_ID)
2915        values[n++] = primary_event_id(event);
2916
2917    perf_output_copy(handle, values, n * sizeof(u64));
2918}
2919
2920/*
2921 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2922 */
2923static void perf_output_read_group(struct perf_output_handle *handle,
2924                struct perf_event *event)
2925{
2926    struct perf_event *leader = event->group_leader, *sub;
2927    u64 read_format = event->attr.read_format;
2928    u64 values[5];
2929    int n = 0;
2930
2931    values[n++] = 1 + leader->nr_siblings;
2932
2933    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2934        values[n++] = leader->total_time_enabled;
2935
2936    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2937        values[n++] = leader->total_time_running;
2938
2939    if (leader != event)
2940        leader->pmu->read(leader);
2941
2942    values[n++] = atomic64_read(&leader->count);
2943    if (read_format & PERF_FORMAT_ID)
2944        values[n++] = primary_event_id(leader);
2945
2946    perf_output_copy(handle, values, n * sizeof(u64));
2947
2948    list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2949        n = 0;
2950
2951        if (sub != event)
2952            sub->pmu->read(sub);
2953
2954        values[n++] = atomic64_read(&sub->count);
2955        if (read_format & PERF_FORMAT_ID)
2956            values[n++] = primary_event_id(sub);
2957
2958        perf_output_copy(handle, values, n * sizeof(u64));
2959    }
2960}
2961
2962static void perf_output_read(struct perf_output_handle *handle,
2963                 struct perf_event *event)
2964{
2965    if (event->attr.read_format & PERF_FORMAT_GROUP)
2966        perf_output_read_group(handle, event);
2967    else
2968        perf_output_read_one(handle, event);
2969}
2970
2971void perf_output_sample(struct perf_output_handle *handle,
2972            struct perf_event_header *header,
2973            struct perf_sample_data *data,
2974            struct perf_event *event)
2975{
2976    u64 sample_type = data->type;
2977
2978    perf_output_put(handle, *header);
2979
2980    if (sample_type & PERF_SAMPLE_IP)
2981        perf_output_put(handle, data->ip);
2982
2983    if (sample_type & PERF_SAMPLE_TID)
2984        perf_output_put(handle, data->tid_entry);
2985
2986    if (sample_type & PERF_SAMPLE_TIME)
2987        perf_output_put(handle, data->time);
2988
2989    if (sample_type & PERF_SAMPLE_ADDR)
2990        perf_output_put(handle, data->addr);
2991
2992    if (sample_type & PERF_SAMPLE_ID)
2993        perf_output_put(handle, data->id);
2994
2995    if (sample_type & PERF_SAMPLE_STREAM_ID)
2996        perf_output_put(handle, data->stream_id);
2997
2998    if (sample_type & PERF_SAMPLE_CPU)
2999        perf_output_put(handle, data->cpu_entry);
3000
3001    if (sample_type & PERF_SAMPLE_PERIOD)
3002        perf_output_put(handle, data->period);
3003
3004    if (sample_type & PERF_SAMPLE_READ)
3005        perf_output_read(handle, event);
3006
3007    if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3008        if (data->callchain) {
3009            int size = 1;
3010
3011            if (data->callchain)
3012                size += data->callchain->nr;
3013
3014            size *= sizeof(u64);
3015
3016            perf_output_copy(handle, data->callchain, size);
3017        } else {
3018            u64 nr = 0;
3019            perf_output_put(handle, nr);
3020        }
3021    }
3022
3023    if (sample_type & PERF_SAMPLE_RAW) {
3024        if (data->raw) {
3025            perf_output_put(handle, data->raw->size);
3026            perf_output_copy(handle, data->raw->data,
3027                     data->raw->size);
3028        } else {
3029            struct {
3030                u32 size;
3031                u32 data;
3032            } raw = {
3033                .size = sizeof(u32),
3034                .data = 0,
3035            };
3036            perf_output_put(handle, raw);
3037        }
3038    }
3039}
3040
3041void perf_prepare_sample(struct perf_event_header *header,
3042             struct perf_sample_data *data,
3043             struct perf_event *event,
3044             struct pt_regs *regs)
3045{
3046    u64 sample_type = event->attr.sample_type;
3047
3048    data->type = sample_type;
3049
3050    header->type = PERF_RECORD_SAMPLE;
3051    header->size = sizeof(*header);
3052
3053    header->misc = 0;
3054    header->misc |= perf_misc_flags(regs);
3055
3056    if (sample_type & PERF_SAMPLE_IP) {
3057        data->ip = perf_instruction_pointer(regs);
3058
3059        header->size += sizeof(data->ip);
3060    }
3061
3062    if (sample_type & PERF_SAMPLE_TID) {
3063        /* namespace issues */
3064        data->tid_entry.pid = perf_event_pid(event, current);
3065        data->tid_entry.tid = perf_event_tid(event, current);
3066
3067        header->size += sizeof(data->tid_entry);
3068    }
3069
3070    if (sample_type & PERF_SAMPLE_TIME) {
3071        data->time = perf_clock();
3072
3073        header->size += sizeof(data->time);
3074    }
3075
3076    if (sample_type & PERF_SAMPLE_ADDR)
3077        header->size += sizeof(data->addr);
3078
3079    if (sample_type & PERF_SAMPLE_ID) {
3080        data->id = primary_event_id(event);
3081
3082        header->size += sizeof(data->id);
3083    }
3084
3085    if (sample_type & PERF_SAMPLE_STREAM_ID) {
3086        data->stream_id = event->id;
3087
3088        header->size += sizeof(data->stream_id);
3089    }
3090
3091    if (sample_type & PERF_SAMPLE_CPU) {
3092        data->cpu_entry.cpu = raw_smp_processor_id();
3093        data->cpu_entry.reserved = 0;
3094
3095        header->size += sizeof(data->cpu_entry);
3096    }
3097
3098    if (sample_type & PERF_SAMPLE_PERIOD)
3099        header->size += sizeof(data->period);
3100
3101    if (sample_type & PERF_SAMPLE_READ)
3102        header->size += perf_event_read_size(event);
3103
3104    if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3105        int size = 1;
3106
3107        data->callchain = perf_callchain(regs);
3108
3109        if (data->callchain)
3110            size += data->callchain->nr;
3111
3112        header->size += size * sizeof(u64);
3113    }
3114
3115    if (sample_type & PERF_SAMPLE_RAW) {
3116        int size = sizeof(u32);
3117
3118        if (data->raw)
3119            size += data->raw->size;
3120        else
3121            size += sizeof(u32);
3122
3123        WARN_ON_ONCE(size & (sizeof(u64)-1));
3124        header->size += size;
3125    }
3126}
3127
3128static void perf_event_output(struct perf_event *event, int nmi,
3129                struct perf_sample_data *data,
3130                struct pt_regs *regs)
3131{
3132    struct perf_output_handle handle;
3133    struct perf_event_header header;
3134
3135    perf_prepare_sample(&header, data, event, regs);
3136
3137    if (perf_output_begin(&handle, event, header.size, nmi, 1))
3138        return;
3139
3140    perf_output_sample(&handle, &header, data, event);
3141
3142    perf_output_end(&handle);
3143}
3144
/*
 * read event_id
 *
 * Fixed leading part of a PERF_RECORD_READ record as written by
 * perf_event_read_event(); the values emitted by perf_output_read()
 * follow it in the buffer.
 */

struct perf_read_event {
    struct perf_event_header header;

    /* pid and tid of the task the record is about */
    u32 pid;
    u32 tid;
};
3155
3156static void
3157perf_event_read_event(struct perf_event *event,
3158            struct task_struct *task)
3159{
3160    struct perf_output_handle handle;
3161    struct perf_read_event read_event = {
3162        .header = {
3163            .type = PERF_RECORD_READ,
3164            .misc = 0,
3165            .size = sizeof(read_event) + perf_event_read_size(event),
3166        },
3167        .pid = perf_event_pid(event, task),
3168        .tid = perf_event_tid(event, task),
3169    };
3170    int ret;
3171
3172    ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3173    if (ret)
3174        return;
3175
3176    perf_output_put(&handle, read_event);
3177    perf_output_read(&handle, event);
3178
3179    perf_output_end(&handle);
3180}
3181
/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.task
 */

struct perf_task_event {
    /* task the fork/exit record is about */
    struct task_struct *task;
    /* context to deliver to; when NULL, task->perf_event_ctxp is used */
    struct perf_event_context *task_ctx;

    /* the record as written to the buffer */
    struct {
        struct perf_event_header header;

        /* pid/tid come from ->task, ppid/ptid from current */
        u32 pid;
        u32 ppid;
        u32 tid;
        u32 ptid;
        u64 time;
    } event_id;
};
3202
3203static void perf_event_task_output(struct perf_event *event,
3204                     struct perf_task_event *task_event)
3205{
3206    struct perf_output_handle handle;
3207    int size;
3208    struct task_struct *task = task_event->task;
3209    int ret;
3210
3211    size = task_event->event_id.header.size;
3212    ret = perf_output_begin(&handle, event, size, 0, 0);
3213
3214    if (ret)
3215        return;
3216
3217    task_event->event_id.pid = perf_event_pid(event, task);
3218    task_event->event_id.ppid = perf_event_pid(event, current);
3219
3220    task_event->event_id.tid = perf_event_tid(event, task);
3221    task_event->event_id.ptid = perf_event_tid(event, current);
3222
3223    task_event->event_id.time = perf_clock();
3224
3225    perf_output_put(&handle, task_event->event_id);
3226
3227    perf_output_end(&handle);
3228}
3229
3230static int perf_event_task_match(struct perf_event *event)
3231{
3232    if (event->state != PERF_EVENT_STATE_ACTIVE)
3233        return 0;
3234
3235    if (event->cpu != -1 && event->cpu != smp_processor_id())
3236        return 0;
3237
3238    if (event->attr.comm || event->attr.mmap || event->attr.task)
3239        return 1;
3240
3241    return 0;
3242}
3243
3244static void perf_event_task_ctx(struct perf_event_context *ctx,
3245                  struct perf_task_event *task_event)
3246{
3247    struct perf_event *event;
3248
3249    if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3250        return;
3251
3252    rcu_read_lock();
3253    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3254        if (perf_event_task_match(event))
3255            perf_event_task_output(event, task_event);
3256    }
3257    rcu_read_unlock();
3258}
3259
3260static void perf_event_task_event(struct perf_task_event *task_event)
3261{
3262    struct perf_cpu_context *cpuctx;
3263    struct perf_event_context *ctx = task_event->task_ctx;
3264
3265    cpuctx = &get_cpu_var(perf_cpu_context);
3266    perf_event_task_ctx(&cpuctx->ctx, task_event);
3267
3268    rcu_read_lock();
3269    if (!ctx)
3270        ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3271    if (ctx)
3272        perf_event_task_ctx(ctx, task_event);
3273    put_cpu_var(perf_cpu_context);
3274    rcu_read_unlock();
3275}
3276
3277static void perf_event_task(struct task_struct *task,
3278                  struct perf_event_context *task_ctx,
3279                  int new)
3280{
3281    struct perf_task_event task_event;
3282
3283    if (!atomic_read(&nr_comm_events) &&
3284        !atomic_read(&nr_mmap_events) &&
3285        !atomic_read(&nr_task_events))
3286        return;
3287
3288    task_event = (struct perf_task_event){
3289        .task = task,
3290        .task_ctx = task_ctx,
3291        .event_id = {
3292            .header = {
3293                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3294                .misc = 0,
3295                .size = sizeof(task_event.event_id),
3296            },
3297            /* .pid */
3298            /* .ppid */
3299            /* .tid */
3300            /* .ptid */
3301        },
3302    };
3303
3304    perf_event_task_event(&task_event);
3305}
3306
3307void perf_event_fork(struct task_struct *task)
3308{
3309    perf_event_task(task, NULL, 1);
3310}
3311
3312/*
3313 * comm tracking
3314 */
3315
3316struct perf_comm_event {
3317    struct task_struct *task;
3318    char *comm;
3319    int comm_size;
3320
3321    struct {
3322        struct perf_event_header header;
3323
3324        u32 pid;
3325        u32 tid;
3326    } event_id;
3327};
3328
3329static void perf_event_comm_output(struct perf_event *event,
3330                     struct perf_comm_event *comm_event)
3331{
3332    struct perf_output_handle handle;
3333    int size = comm_event->event_id.header.size;
3334    int ret = perf_output_begin(&handle, event, size, 0, 0);
3335
3336    if (ret)
3337        return;
3338
3339    comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3340    comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3341
3342    perf_output_put(&handle, comm_event->event_id);
3343    perf_output_copy(&handle, comm_event->comm,
3344                   comm_event->comm_size);
3345    perf_output_end(&handle);
3346}
3347
3348static int perf_event_comm_match(struct perf_event *event)
3349{
3350    if (event->state != PERF_EVENT_STATE_ACTIVE)
3351        return 0;
3352
3353    if (event->cpu != -1 && event->cpu != smp_processor_id())
3354        return 0;
3355
3356    if (event->attr.comm)
3357        return 1;
3358
3359    return 0;
3360}
3361
3362static void perf_event_comm_ctx(struct perf_event_context *ctx,
3363                  struct perf_comm_event *comm_event)
3364{
3365    struct perf_event *event;
3366
3367    if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3368        return;
3369
3370    rcu_read_lock();
3371    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3372        if (perf_event_comm_match(event))
3373            perf_event_comm_output(event, comm_event);
3374    }
3375    rcu_read_unlock();
3376}
3377
3378static void perf_event_comm_event(struct perf_comm_event *comm_event)
3379{
3380    struct perf_cpu_context *cpuctx;
3381    struct perf_event_context *ctx;
3382    unsigned int size;
3383    char comm[TASK_COMM_LEN];
3384
3385    memset(comm, 0, sizeof(comm));
3386    strncpy(comm, comm_event->task->comm, sizeof(comm));
3387    size = ALIGN(strlen(comm)+1, sizeof(u64));
3388
3389    comm_event->comm = comm;
3390    comm_event->comm_size = size;
3391
3392    comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3393
3394    cpuctx = &get_cpu_var(perf_cpu_context);
3395    perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3396
3397    rcu_read_lock();
3398    /*
3399     * doesn't really matter which of the child contexts the
3400     * events ends up in.
3401     */
3402    ctx = rcu_dereference(current->perf_event_ctxp);
3403    if (ctx)
3404        perf_event_comm_ctx(ctx, comm_event);
3405    put_cpu_var(perf_cpu_context);
3406    rcu_read_unlock();
3407}
3408
3409void perf_event_comm(struct task_struct *task)
3410{
3411    struct perf_comm_event comm_event;
3412
3413    if (task->perf_event_ctxp)
3414        perf_event_enable_on_exec(task);
3415
3416    if (!atomic_read(&nr_comm_events))
3417        return;
3418
3419    comm_event = (struct perf_comm_event){
3420        .task = task,
3421        /* .comm */
3422        /* .comm_size */
3423        .event_id = {
3424            .header = {
3425                .type = PERF_RECORD_COMM,
3426                .misc = 0,
3427                /* .size */
3428            },
3429            /* .pid */
3430            /* .tid */
3431        },
3432    };
3433
3434    perf_event_comm_event(&comm_event);
3435}
3436
3437/*
3438 * mmap tracking
3439 */
3440
3441struct perf_mmap_event {
3442    struct vm_area_struct *vma;
3443
3444    const char *file_name;
3445    int file_size;
3446
3447    struct {
3448        struct perf_event_header header;
3449
3450        u32 pid;
3451        u32 tid;
3452        u64 start;
3453        u64 len;
3454        u64 pgoff;
3455    } event_id;
3456};
3457
3458static void perf_event_mmap_output(struct perf_event *event,
3459                     struct perf_mmap_event *mmap_event)
3460{
3461    struct perf_output_handle handle;
3462    int size = mmap_event->event_id.header.size;
3463    int ret = perf_output_begin(&handle, event, size, 0, 0);
3464
3465    if (ret)
3466        return;
3467
3468    mmap_event->event_id.pid = perf_event_pid(event, current);
3469    mmap_event->event_id.tid = perf_event_tid(event, current);
3470
3471    perf_output_put(&handle, mmap_event->event_id);
3472    perf_output_copy(&handle, mmap_event->file_name,
3473                   mmap_event->file_size);
3474    perf_output_end(&handle);
3475}
3476
3477static int perf_event_mmap_match(struct perf_event *event,
3478                   struct perf_mmap_event *mmap_event)
3479{
3480    if (event->state != PERF_EVENT_STATE_ACTIVE)
3481        return 0;
3482
3483    if (event->cpu != -1 && event->cpu != smp_processor_id())
3484        return 0;
3485
3486    if (event->attr.mmap)
3487        return 1;
3488
3489    return 0;
3490}
3491
3492static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3493                  struct perf_mmap_event *mmap_event)
3494{
3495    struct perf_event *event;
3496
3497    if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3498        return;
3499
3500    rcu_read_lock();
3501    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3502        if (perf_event_mmap_match(event, mmap_event))
3503            perf_event_mmap_output(event, mmap_event);
3504    }
3505    rcu_read_unlock();
3506}
3507
3508static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3509{
3510    struct perf_cpu_context *cpuctx;
3511    struct perf_event_context *ctx;
3512    struct vm_area_struct *vma = mmap_event->vma;
3513    struct file *file = vma->vm_file;
3514    unsigned int size;
3515    char tmp[16];
3516    char *buf = NULL;
3517    const char *name;
3518
3519    memset(tmp, 0, sizeof(tmp));
3520
3521    if (file) {
3522        /*
3523         * d_path works from the end of the buffer backwards, so we
3524         * need to add enough zero bytes after the string to handle
3525         * the 64bit alignment we do later.
3526         */
3527        buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3528        if (!buf) {
3529            name = strncpy(tmp, "//enomem", sizeof(tmp));
3530            goto got_name;
3531        }
3532        name = d_path(&file->f_path, buf, PATH_MAX);
3533        if (IS_ERR(name)) {
3534            name = strncpy(tmp, "//toolong", sizeof(tmp));
3535            goto got_name;
3536        }
3537    } else {
3538        if (arch_vma_name(mmap_event->vma)) {
3539            name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3540                       sizeof(tmp));
3541            goto got_name;
3542        }
3543
3544        if (!vma->vm_mm) {
3545            name = strncpy(tmp, "[vdso]", sizeof(tmp));
3546            goto got_name;
3547        }
3548
3549        name = strncpy(tmp, "//anon", sizeof(tmp));
3550        goto got_name;
3551    }
3552
3553got_name:
3554    size = ALIGN(strlen(name)+1, sizeof(u64));
3555
3556    mmap_event->file_name = name;
3557    mmap_event->file_size = size;
3558
3559    mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3560
3561    cpuctx = &get_cpu_var(perf_cpu_context);
3562    perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3563
3564    rcu_read_lock();
3565    /*
3566     * doesn't really matter which of the child contexts the
3567     * events ends up in.
3568     */
3569    ctx = rcu_dereference(current->perf_event_ctxp);
3570    if (ctx)
3571        perf_event_mmap_ctx(ctx, mmap_event);
3572    put_cpu_var(perf_cpu_context);
3573    rcu_read_unlock();
3574
3575    kfree(buf);
3576}
3577
3578void __perf_event_mmap(struct vm_area_struct *vma)
3579{
3580    struct perf_mmap_event mmap_event;
3581
3582    if (!atomic_read(&nr_mmap_events))
3583        return;
3584
3585    mmap_event = (struct perf_mmap_event){
3586        .vma = vma,
3587        /* .file_name */
3588        /* .file_size */
3589        .event_id = {
3590            .header = {
3591                .type = PERF_RECORD_MMAP,
3592                .misc = 0,
3593                /* .size */
3594            },
3595            /* .pid */
3596            /* .tid */
3597            .start = vma->vm_start,
3598            .len = vma->vm_end - vma->vm_start,
3599            .pgoff = vma->vm_pgoff,
3600        },
3601    };
3602
3603    perf_event_mmap_event(&mmap_event);
3604}
3605
3606/*
3607 * IRQ throttle logging
3608 */
3609
3610static void perf_log_throttle(struct perf_event *event, int enable)
3611{
3612    struct perf_output_handle handle;
3613    int ret;
3614
3615    struct {
3616        struct perf_event_header header;
3617        u64 time;
3618        u64 id;
3619        u64 stream_id;
3620    } throttle_event = {
3621        .header = {
3622            .type = PERF_RECORD_THROTTLE,
3623            .misc = 0,
3624            .size = sizeof(throttle_event),
3625        },
3626        .time = perf_clock(),
3627        .id = primary_event_id(event),
3628        .stream_id = event->id,
3629    };
3630
3631    if (enable)
3632        throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3633
3634    ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3635    if (ret)
3636        return;
3637
3638    perf_output_put(&handle, throttle_event);
3639    perf_output_end(&handle);
3640}
3641
3642/*
3643 * Generic event overflow handling, sampling.
3644 */
3645
3646static int __perf_event_overflow(struct perf_event *event, int nmi,
3647                   int throttle, struct perf_sample_data *data,
3648                   struct pt_regs *regs)
3649{
3650    int events = atomic_read(&event->event_limit);
3651    struct hw_perf_event *hwc = &event->hw;
3652    int ret = 0;
3653
3654    throttle = (throttle && event->pmu->unthrottle != NULL);
3655
3656    if (!throttle) {
3657        hwc->interrupts++;
3658    } else {
3659        if (hwc->interrupts != MAX_INTERRUPTS) {
3660            hwc->interrupts++;
3661            if (HZ * hwc->interrupts >
3662                    (u64)sysctl_perf_event_sample_rate) {
3663                hwc->interrupts = MAX_INTERRUPTS;
3664                perf_log_throttle(event, 0);
3665                ret = 1;
3666            }
3667        } else {
3668            /*
3669             * Keep re-disabling events even though on the previous
3670             * pass we disabled it - just in case we raced with a
3671             * sched-in and the event got enabled again:
3672             */
3673            ret = 1;
3674        }
3675    }
3676
3677    if (event->attr.freq) {
3678        u64 now = perf_clock();
3679        s64 delta = now - hwc->freq_stamp;
3680
3681        hwc->freq_stamp = now;
3682
3683        if (delta > 0 && delta < TICK_NSEC)
3684            perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3685    }
3686
3687    /*
3688     * XXX event_limit might not quite work as expected on inherited
3689     * events
3690     */
3691
3692    event->pending_kill = POLL_IN;
3693    if (events && atomic_dec_and_test(&event->event_limit)) {
3694        ret = 1;
3695        event->pending_kill = POLL_HUP;
3696        if (nmi) {
3697            event->pending_disable = 1;
3698            perf_pending_queue(&event->pending,
3699                       perf_pending_event);
3700        } else
3701            perf_event_disable(event);
3702    }
3703
3704    perf_event_output(event, nmi, data, regs);
3705    return ret;
3706}
3707
/*
 * Overflow entry point with throttling requested; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3714
3715/*
3716 * Generic software event infrastructure
3717 */
3718
3719/*
3720 * We directly increment event->count and keep a second value in
3721 * event->hw.period_left to count intervals. This period event
3722 * is kept in the range [-sample_period, 0] so that we can use the
3723 * sign as trigger.
3724 */
3725
3726static u64 perf_swevent_set_period(struct perf_event *event)
3727{
3728    struct hw_perf_event *hwc = &event->hw;
3729    u64 period = hwc->last_period;
3730    u64 nr, offset;
3731    s64 old, val;
3732
3733    hwc->last_period = hwc->sample_period;
3734
3735again:
3736    old = val = atomic64_read(&hwc->period_left);
3737    if (val < 0)
3738        return 0;
3739
3740    nr = div64_u64(period + val, period);
3741    offset = nr * period;
3742    val -= offset;
3743    if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3744        goto again;
3745
3746    return nr;
3747}
3748
3749static void perf_swevent_overflow(struct perf_event *event,
3750                    int nmi, struct perf_sample_data *data,
3751                    struct pt_regs *regs)
3752{
3753    struct hw_perf_event *hwc = &event->hw;
3754    int throttle = 0;
3755    u64 overflow;
3756
3757    data->period = event->hw.last_period;
3758    overflow = perf_swevent_set_period(event);
3759
3760    if (hwc->interrupts == MAX_INTERRUPTS)
3761        return;
3762
3763    for (; overflow; overflow--) {
3764        if (__perf_event_overflow(event, nmi, throttle,
3765                        data, regs)) {
3766            /*
3767             * We inhibit the overflow from happening when
3768             * hwc->interrupts == MAX_INTERRUPTS.
3769             */
3770            break;
3771        }
3772        throttle = 1;
3773    }
3774}
3775
/*
 * pmu->unthrottle callback for generic software events. Intentionally
 * empty: hwc->interrupts has already been reset by the time this is
 * called, and there is nothing else to re-arm.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
}
3782
3783static void perf_swevent_add(struct perf_event *event, u64 nr,
3784                   int nmi, struct perf_sample_data *data,
3785                   struct pt_regs *regs)
3786{
3787    struct hw_perf_event *hwc = &event->hw;
3788
3789    atomic64_add(nr, &event->count);
3790
3791    if (!hwc->sample_period)
3792        return;
3793
3794    if (!regs)
3795        return;
3796
3797    if (!atomic64_add_negative(nr, &hwc->period_left))
3798        perf_swevent_overflow(event, nmi, data, regs);
3799}
3800
3801static int perf_swevent_is_counting(struct perf_event *event)
3802{
3803    /*
3804     * The event is active, we're good!
3805     */
3806    if (event->state == PERF_EVENT_STATE_ACTIVE)
3807        return 1;
3808
3809    /*
3810     * The event is off/error, not counting.
3811     */
3812    if (event->state != PERF_EVENT_STATE_INACTIVE)
3813        return 0;
3814
3815    /*
3816     * The event is inactive, if the context is active
3817     * we're part of a group that didn't make it on the 'pmu',
3818     * not counting.
3819     */
3820    if (event->ctx->is_active)
3821        return 0;
3822
3823    /*
3824     * We're inactive and the context is too, this means the
3825     * task is scheduled out, we're counting events that happen
3826     * to us, like migration events.
3827     */
3828    return 1;
3829}
3830
3831static int perf_swevent_match(struct perf_event *event,
3832                enum perf_type_id type,
3833                u32 event_id, struct pt_regs *regs)
3834{
3835    if (event->cpu != -1 && event->cpu != smp_processor_id())
3836        return 0;
3837
3838    if (!perf_swevent_is_counting(event))
3839        return 0;
3840
3841    if (event->attr.type != type)
3842        return 0;
3843    if (event->attr.config != event_id)
3844        return 0;
3845
3846    if (regs) {
3847        if (event->attr.exclude_user && user_mode(regs))
3848            return 0;
3849
3850        if (event->attr.exclude_kernel && !user_mode(regs))
3851            return 0;
3852    }
3853
3854    return 1;
3855}
3856
3857static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3858                     enum perf_type_id type,
3859                     u32 event_id, u64 nr, int nmi,
3860                     struct perf_sample_data *data,
3861                     struct pt_regs *regs)
3862{
3863    struct perf_event *event;
3864
3865    if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3866        return;
3867
3868    rcu_read_lock();
3869    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3870        if (perf_swevent_match(event, type, event_id, regs))
3871            perf_swevent_add(event, nr, nmi, data, regs);
3872    }
3873    rcu_read_unlock();
3874}
3875
3876static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3877{
3878    if (in_nmi())
3879        return &cpuctx->recursion[3];
3880
3881    if (in_irq())
3882        return &cpuctx->recursion[2];
3883
3884    if (in_softirq())
3885        return &cpuctx->recursion[1];
3886
3887    return &cpuctx->recursion[0];
3888}
3889
3890static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3891                    u64 nr, int nmi,
3892                    struct perf_sample_data *data,
3893                    struct pt_regs *regs)
3894{
3895    struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3896    int *recursion = perf_swevent_recursion_context(cpuctx);
3897    struct perf_event_context *ctx;
3898
3899    if (*recursion)
3900        goto out;
3901
3902    (*recursion)++;
3903    barrier();
3904
3905    perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3906                 nr, nmi, data, regs);
3907    rcu_read_lock();
3908    /*
3909     * doesn't really matter which of the child contexts the
3910     * events ends up in.
3911     */
3912    ctx = rcu_dereference(current->perf_event_ctxp);
3913    if (ctx)
3914        perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3915    rcu_read_unlock();
3916
3917    barrier();
3918    (*recursion)--;
3919
3920out:
3921    put_cpu_var(perf_cpu_context);
3922}
3923
3924void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3925                struct pt_regs *regs, u64 addr)
3926{
3927    struct perf_sample_data data = {
3928        .addr = addr,
3929    };
3930
3931    do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3932                &data, regs);
3933}
3934
/*
 * pmu->read callback for generic software events. Intentionally empty:
 * perf_swevent_add() adds straight into event->count, so there is
 * nothing to fold in here.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
3938
3939static int perf_swevent_enable(struct perf_event *event)
3940{
3941    struct hw_perf_event *hwc = &event->hw;
3942
3943    if (hwc->sample_period) {
3944        hwc->last_period = hwc->sample_period;
3945        perf_swevent_set_period(event);
3946    }
3947    return 0;
3948}
3949
/*
 * pmu->disable callback for generic software events. Intentionally
 * empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
3953
3954static const struct pmu perf_ops_generic = {
3955    .enable = perf_swevent_enable,
3956    .disable = perf_swevent_disable,
3957    .read = perf_swevent_read,
3958    .unthrottle = perf_swevent_unthrottle,
3959};
3960
3961/*
3962 * hrtimer based swevent callback
3963 */
3964
3965static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3966{
3967    enum hrtimer_restart ret = HRTIMER_RESTART;
3968    struct perf_sample_data data;
3969    struct pt_regs *regs;
3970    struct perf_event *event;
3971    u64 period;
3972
3973    event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3974    event->pmu->read(event);
3975
3976    data.addr = 0;
3977    data.period = event->hw.last_period;
3978    regs = get_irq_regs();
3979    /*
3980     * In case we exclude kernel IPs or are somehow not in interrupt
3981     * context, provide the next best thing, the user IP.
3982     */
3983    if ((event->attr.exclude_kernel || !regs) &&
3984            !event->attr.exclude_user)
3985        regs = task_pt_regs(current);
3986
3987    if (regs) {
3988        if (!(event->attr.exclude_idle && current->pid == 0))
3989            if (perf_event_overflow(event, 0, &data, regs))
3990                ret = HRTIMER_NORESTART;
3991    }
3992
3993    period = max_t(u64, 10000, event->hw.sample_period);
3994    hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3995
3996    return ret;
3997}
3998
3999static void perf_swevent_start_hrtimer(struct perf_event *event)
4000{
4001    struct hw_perf_event *hwc = &event->hw;
4002
4003    hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4004    hwc->hrtimer.function = perf_swevent_hrtimer;
4005    if (hwc->sample_period) {
4006        u64 period;
4007
4008        if (hwc->remaining) {
4009            if (hwc->remaining < 0)
4010                period = 10000;
4011            else
4012                period = hwc->remaining;
4013            hwc->remaining = 0;
4014        } else {
4015            period = max_t(u64, 10000, hwc->sample_period);
4016        }
4017        __hrtimer_start_range_ns(&hwc->hrtimer,
4018                ns_to_ktime(period), 0,
4019                HRTIMER_MODE_REL, 0);
4020    }
4021}
4022
4023static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4024{
4025    struct hw_perf_event *hwc = &event->hw;
4026
4027    if (hwc->sample_period) {
4028        ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4029        hwc->remaining = ktime_to_ns(remaining);
4030
4031        hrtimer_cancel(&hwc->hrtimer);
4032    }
4033}
4034
4035/*
4036 * Software event: cpu wall time clock
4037 */
4038
4039static void cpu_clock_perf_event_update(struct perf_event *event)
4040{
4041    int cpu = raw_smp_processor_id();
4042    s64 prev;
4043    u64 now;
4044
4045    now = cpu_clock(cpu);
4046    prev = atomic64_read(&event->hw.prev_count);
4047    atomic64_set(&event->hw.prev_count, now);
4048    atomic64_add(now - prev, &event->count);
4049}
4050
4051static int cpu_clock_perf_event_enable(struct perf_event *event)
4052{
4053    struct hw_perf_event *hwc = &event->hw;
4054    int cpu = raw_smp_processor_id();
4055
4056    atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4057    perf_swevent_start_hrtimer(event);
4058
4059    return 0;
4060}
4061
/*
 * pmu->disable callback for the cpu clock event: stop the sampling
 * timer, then account the time that has elapsed up to now.
 */
static void cpu_clock_perf_event_disable(struct perf_event *event)
{
	perf_swevent_cancel_hrtimer(event);

	cpu_clock_perf_event_update(event);
}
4067
/*
 * pmu->read callback for the cpu clock event: bring the count up to
 * date with the time elapsed since the last update.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}
4072
4073static const struct pmu perf_ops_cpu_clock = {
4074    .enable = cpu_clock_perf_event_enable,
4075    .disable = cpu_clock_perf_event_disable,
4076    .read = cpu_clock_perf_event_read,
4077};
4078
4079/*
4080 * Software event: task time clock
4081 */
4082
4083static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4084{
4085    u64 prev;
4086    s64 delta;
4087
4088    prev = atomic64_xchg(&event->hw.prev_count, now);
4089    delta = now - prev;
4090    atomic64_add(delta, &event->count);
4091}
4092
4093static int task_clock_perf_event_enable(struct perf_event *event)
4094{
4095    struct hw_perf_event *hwc = &event->hw;
4096    u64 now;
4097
4098    now = event->ctx->time;
4099
4100    atomic64_set(&hwc->prev_count, now);
4101
4102    perf_swevent_start_hrtimer(event);
4103
4104    return 0;
4105}
4106
4107static void task_clock_perf_event_disable(struct perf_event *event)
4108{
4109    perf_swevent_cancel_hrtimer(event);
4110    task_clock_perf_event_update(event, event->ctx->time);
4111
4112}
4113
4114static void task_clock_perf_event_read(struct perf_event *event)
4115{
4116    u64 time;
4117
4118    if (!in_nmi()) {
4119        update_context_time(event->ctx);
4120        time = event->ctx->time;
4121    } else {
4122        u64 now = perf_clock();
4123        u64 delta = now - event->ctx->timestamp;
4124        time = event->ctx->time + delta;
4125    }
4126
4127    task_clock_perf_event_update(event, time);
4128}
4129
4130static const struct pmu perf_ops_task_clock = {
4131    .enable = task_clock_perf_event_enable,
4132    .disable = task_clock_perf_event_disable,
4133    .read = task_clock_perf_event_read,
4134};
4135
4136#ifdef CONFIG_EVENT_PROFILE
4137void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4138              int entry_size)
4139{
4140    struct perf_raw_record raw = {
4141        .size = entry_size,
4142        .data = record,
4143    };
4144
4145    struct perf_sample_data data = {
4146        .addr = addr,
4147        .raw = &raw,
4148    };
4149
4150    struct pt_regs *regs = get_irq_regs();
4151
4152    if (!regs)
4153        regs = task_pt_regs(current);
4154
4155    do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4156                &data, regs);
4157}
4158EXPORT_SYMBOL_GPL(perf_tp_event);
4159
4160extern int ftrace_profile_enable(int);
4161extern void ftrace_profile_disable(int);
4162
4163static void tp_perf_event_destroy(struct perf_event *event)
4164{
4165    ftrace_profile_disable(event->attr.config);
4166}
4167
4168static const struct pmu *tp_perf_event_init(struct perf_event *event)
4169{
4170    /*
4171     * Raw tracepoint data is a severe data leak, only allow root to
4172     * have these.
4173     */
4174    if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4175            perf_paranoid_tracepoint_raw() &&
4176            !capable(CAP_SYS_ADMIN))
4177        return ERR_PTR(-EPERM);
4178
4179    if (ftrace_profile_enable(event->attr.config))
4180        return NULL;
4181
4182    event->destroy = tp_perf_event_destroy;
4183
4184    return &perf_ops_generic;
4185}
4186#else
4187static const struct pmu *tp_perf_event_init(struct perf_event *event)
4188{
4189    return NULL;
4190}
4191#endif
4192
4193atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4194
4195static void sw_perf_event_destroy(struct perf_event *event)
4196{
4197    u64 event_id = event->attr.config;
4198
4199    WARN_ON(event->parent);
4200
4201    atomic_dec(&perf_swevent_enabled[event_id]);
4202}
4203
4204static const struct pmu *sw_perf_event_init(struct perf_event *event)
4205{
4206    const struct pmu *pmu = NULL;
4207    u64 event_id = event->attr.config;
4208
4209    /*
4210     * Software events (currently) can't in general distinguish
4211     * between user, kernel and hypervisor events.
4212     * However, context switches and cpu migrations are considered
4213     * to be kernel events, and page faults are never hypervisor
4214     * events.
4215     */
4216    switch (event_id) {
4217    case PERF_COUNT_SW_CPU_CLOCK:
4218        pmu = &perf_ops_cpu_clock;
4219
4220        break;
4221    case PERF_COUNT_SW_TASK_CLOCK:
4222        /*
4223         * If the user instantiates this as a per-cpu event,
4224         * use the cpu_clock event instead.
4225         */
4226        if (event->ctx->task)
4227            pmu = &perf_ops_task_clock;
4228        else
4229            pmu = &perf_ops_cpu_clock;
4230
4231        break;
4232    case PERF_COUNT_SW_PAGE_FAULTS:
4233    case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4234    case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4235    case PERF_COUNT_SW_CONTEXT_SWITCHES:
4236    case PERF_COUNT_SW_CPU_MIGRATIONS:
4237        if (!event->parent) {
4238            atomic_inc(&perf_swevent_enabled[event_id]);
4239            event->destroy = sw_perf_event_destroy;
4240        }
4241        pmu = &perf_ops_generic;
4242        break;
4243    }
4244
4245    return pmu;
4246}
4247
4248/*
 * Allocate and initialize an event structure
4250 */
4251static struct perf_event *
4252perf_event_alloc(struct perf_event_attr *attr,
4253           int cpu,
4254           struct perf_event_context *ctx,
4255           struct perf_event *group_leader,
4256           struct perf_event *parent_event,
4257           gfp_t gfpflags)
4258{
4259    const struct pmu *pmu;
4260    struct perf_event *event;
4261    struct hw_perf_event *hwc;
4262    long err;
4263
4264    event = kzalloc(sizeof(*event), gfpflags);
4265    if (!event)
4266        return ERR_PTR(-ENOMEM);
4267
4268    /*
4269     * Single events are their own group leaders, with an
4270     * empty sibling list:
4271     */
4272    if (!group_leader)
4273        group_leader = event;
4274
4275    mutex_init(&event->child_mutex);
4276    INIT_LIST_HEAD(&event->child_list);
4277
4278    INIT_LIST_HEAD(&event->group_entry);
4279    INIT_LIST_HEAD(&event->event_entry);
4280    INIT_LIST_HEAD(&event->sibling_list);
4281    init_waitqueue_head(&event->waitq);
4282
4283    mutex_init(&event->mmap_mutex);
4284
4285    event->cpu = cpu;
4286    event->attr = *attr;
4287    event->group_leader = group_leader;
4288    event->pmu = NULL;
4289    event->ctx = ctx;
4290    event->oncpu = -1;
4291
4292    event->parent = parent_event;
4293
4294    event->ns = get_pid_ns(current->nsproxy->pid_ns);
4295    event->id = atomic64_inc_return(&perf_event_id);
4296
4297    event->state = PERF_EVENT_STATE_INACTIVE;
4298
4299    if (attr->disabled)
4300        event->state = PERF_EVENT_STATE_OFF;
4301
4302    pmu = NULL;
4303
4304    hwc = &event->hw;
4305    hwc->sample_period = attr->sample_period;
4306    if (attr->freq && attr->sample_freq)
4307        hwc->sample_period = 1;
4308    hwc->last_period = hwc->sample_period;
4309
4310    atomic64_set(&hwc->period_left, hwc->sample_period);
4311
4312    /*
4313     * we currently do not support PERF_FORMAT_GROUP on inherited events
4314     */
4315    if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4316        goto done;
4317
4318    switch (attr->type) {
4319    case PERF_TYPE_RAW:
4320    case PERF_TYPE_HARDWARE:
4321    case PERF_TYPE_HW_CACHE:
4322        pmu = hw_perf_event_init(event);
4323        break;
4324
4325    case PERF_TYPE_SOFTWARE:
4326        pmu = sw_perf_event_init(event);
4327        break;
4328
4329    case PERF_TYPE_TRACEPOINT:
4330        pmu = tp_perf_event_init(event);
4331        break;
4332
4333    default:
4334        break;
4335    }
4336done:
4337    err = 0;
4338    if (!pmu)
4339        err = -EINVAL;
4340    else if (IS_ERR(pmu))
4341        err = PTR_ERR(pmu);
4342
4343    if (err) {
4344        if (event->ns)
4345            put_pid_ns(event->ns);
4346        kfree(event);
4347        return ERR_PTR(err);
4348    }
4349
4350    event->pmu = pmu;
4351
4352    if (!event->parent) {
4353        atomic_inc(&nr_events);
4354        if (event->attr.mmap)
4355            atomic_inc(&nr_mmap_events);
4356        if (event->attr.comm)
4357            atomic_inc(&nr_comm_events);
4358        if (event->attr.task)
4359            atomic_inc(&nr_task_events);
4360    }
4361
4362    return event;
4363}
4364
4365static int perf_copy_attr(struct perf_event_attr __user *uattr,
4366              struct perf_event_attr *attr)
4367{
4368    u32 size;
4369    int ret;
4370
4371    if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4372        return -EFAULT;
4373
4374    /*
4375     * zero the full structure, so that a short copy will be nice.
4376     */
4377    memset(attr, 0, sizeof(*attr));
4378
4379    ret = get_user(size, &uattr->size);
4380    if (ret)
4381        return ret;
4382
4383    if (size > PAGE_SIZE) /* silly large */
4384        goto err_size;
4385
4386    if (!size) /* abi compat */
4387        size = PERF_ATTR_SIZE_VER0;
4388
4389    if (size < PERF_ATTR_SIZE_VER0)
4390        goto err_size;
4391
4392    /*
4393     * If we're handed a bigger struct than we know of,
4394     * ensure all the unknown bits are 0 - i.e. new
4395     * user-space does not rely on any kernel feature
4396     * extensions we dont know about yet.
4397     */
4398    if (size > sizeof(*attr)) {
4399        unsigned char __user *addr;
4400        unsigned char __user *end;
4401        unsigned char val;
4402
4403        addr = (void __user *)uattr + sizeof(*attr);
4404        end = (void __user *)uattr + size;
4405
4406        for (; addr < end; addr++) {
4407            ret = get_user(val, addr);
4408            if (ret)
4409                return ret;
4410            if (val)
4411                goto err_size;
4412        }
4413        size = sizeof(*attr);
4414    }
4415
4416    ret = copy_from_user(attr, uattr, size);
4417    if (ret)
4418        return -EFAULT;
4419
4420    /*
4421     * If the type exists, the corresponding creation will verify
4422     * the attr->config.
4423     */
4424    if (attr->type >= PERF_TYPE_MAX)
4425        return -EINVAL;
4426
4427    if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4428        return -EINVAL;
4429
4430    if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4431        return -EINVAL;
4432
4433    if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4434        return -EINVAL;
4435
4436out:
4437    return ret;
4438
4439err_size:
4440    put_user(sizeof(*attr), &uattr->size);
4441    ret = -E2BIG;
4442    goto out;
4443}
4444
4445int perf_event_set_output(struct perf_event *event, int output_fd)
4446{
4447    struct perf_event *output_event = NULL;
4448    struct file *output_file = NULL;
4449    struct perf_event *old_output;
4450    int fput_needed = 0;
4451    int ret = -EINVAL;
4452
4453    if (!output_fd)
4454        goto set;
4455
4456    output_file = fget_light(output_fd, &fput_needed);
4457    if (!output_file)
4458        return -EBADF;
4459
4460    if (output_file->f_op != &perf_fops)
4461        goto out;
4462
4463    output_event = output_file->private_data;
4464
4465    /* Don't chain output fds */
4466    if (output_event->output)
4467        goto out;
4468
4469    /* Don't set an output fd when we already have an output channel */
4470    if (event->data)
4471        goto out;
4472
4473    atomic_long_inc(&output_file->f_count);
4474
4475set:
4476    mutex_lock(&event->mmap_mutex);
4477    old_output = event->output;
4478    rcu_assign_pointer(event->output, output_event);
4479    mutex_unlock(&event->mmap_mutex);
4480
4481    if (old_output) {
4482        /*
4483         * we need to make sure no existing perf_output_*()
4484         * is still referencing this event.
4485         */
4486        synchronize_rcu();
4487        fput(old_output->filp);
4488    }
4489
4490    ret = 0;
4491out:
4492    fput_light(output_file, fput_needed);
4493    return ret;
4494}
4495
4496/**
4497 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4498 *
4499 * @attr_uptr: event_id type attributes for monitoring/sampling
4500 * @pid: target pid
4501 * @cpu: target cpu
4502 * @group_fd: group leader event fd
4503 */
4504SYSCALL_DEFINE5(perf_event_open,
4505        struct perf_event_attr __user *, attr_uptr,
4506        pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4507{
4508    struct perf_event *event, *group_leader;
4509    struct perf_event_attr attr;
4510    struct perf_event_context *ctx;
4511    struct file *event_file = NULL;
4512    struct file *group_file = NULL;
4513    int fput_needed = 0;
4514    int fput_needed2 = 0;
4515    int err;
4516
4517    /* for future expandability... */
4518    if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4519        return -EINVAL;
4520
4521    err = perf_copy_attr(attr_uptr, &attr);
4522    if (err)
4523        return err;
4524
4525    if (!attr.exclude_kernel) {
4526        if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4527            return -EACCES;
4528    }
4529
4530    if (attr.freq) {
4531        if (attr.sample_freq > sysctl_perf_event_sample_rate)
4532            return -EINVAL;
4533    }
4534
4535    /*
4536     * Get the target context (task or percpu):
4537     */
4538    ctx = find_get_context(pid, cpu);
4539    if (IS_ERR(ctx))
4540        return PTR_ERR(ctx);
4541
4542    /*
4543     * Look up the group leader (we will attach this event to it):
4544     */
4545    group_leader = NULL;
4546    if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4547        err = -EINVAL;
4548        group_file = fget_light(group_fd, &fput_needed);
4549        if (!group_file)
4550            goto err_put_context;
4551        if (group_file->f_op != &perf_fops)
4552            goto err_put_context;
4553
4554        group_leader = group_file->private_data;
4555        /*
4556         * Do not allow a recursive hierarchy (this new sibling
4557         * becoming part of another group-sibling):
4558         */
4559        if (group_leader->group_leader != group_leader)
4560            goto err_put_context;
4561        /*
4562         * Do not allow to attach to a group in a different
4563         * task or CPU context:
4564         */
4565        if (group_leader->ctx != ctx)
4566            goto err_put_context;
4567        /*
4568         * Only a group leader can be exclusive or pinned
4569         */
4570        if (attr.exclusive || attr.pinned)
4571            goto err_put_context;
4572    }
4573
4574    event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4575                     NULL, GFP_KERNEL);
4576    err = PTR_ERR(event);
4577    if (IS_ERR(event))
4578        goto err_put_context;
4579
4580    err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4581    if (err < 0)
4582        goto err_free_put_context;
4583
4584    event_file = fget_light(err, &fput_needed2);
4585    if (!event_file)
4586        goto err_free_put_context;
4587
4588    if (flags & PERF_FLAG_FD_OUTPUT) {
4589        err = perf_event_set_output(event, group_fd);
4590        if (err)
4591            goto err_fput_free_put_context;
4592    }
4593
4594    event->filp = event_file;
4595    WARN_ON_ONCE(ctx->parent_ctx);
4596    mutex_lock(&ctx->mutex);
4597    perf_install_in_context(ctx, event, cpu);
4598    ++ctx->generation;
4599    mutex_unlock(&ctx->mutex);
4600
4601    event->owner = current;
4602    get_task_struct(current);
4603    mutex_lock(&current->perf_event_mutex);
4604    list_add_tail(&event->owner_entry, &current->perf_event_list);
4605    mutex_unlock(&current->perf_event_mutex);
4606
4607err_fput_free_put_context:
4608    fput_light(event_file, fput_needed2);
4609
4610err_free_put_context:
4611    if (err < 0)
4612        kfree(event);
4613
4614err_put_context:
4615    if (err < 0)
4616        put_ctx(ctx);
4617
4618    fput_light(group_file, fput_needed);
4619
4620    return err;
4621}
4622
4623/*
4624 * inherit a event from parent task to child task:
4625 */
4626static struct perf_event *
4627inherit_event(struct perf_event *parent_event,
4628          struct task_struct *parent,
4629          struct perf_event_context *parent_ctx,
4630          struct task_struct *child,
4631          struct perf_event *group_leader,
4632          struct perf_event_context *child_ctx)
4633{
4634    struct perf_event *child_event;
4635
4636    /*
4637     * Instead of creating recursive hierarchies of events,
4638     * we link inherited events back to the original parent,
4639     * which has a filp for sure, which we use as the reference
4640     * count:
4641     */
4642    if (parent_event->parent)
4643        parent_event = parent_event->parent;
4644
4645    child_event = perf_event_alloc(&parent_event->attr,
4646                       parent_event->cpu, child_ctx,
4647                       group_leader, parent_event,
4648                       GFP_KERNEL);
4649    if (IS_ERR(child_event))
4650        return child_event;
4651    get_ctx(child_ctx);
4652
4653    /*
4654     * Make the child state follow the state of the parent event,
4655     * not its attr.disabled bit. We hold the parent's mutex,
4656     * so we won't race with perf_event_{en, dis}able_family.
4657     */
4658    if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4659        child_event->state = PERF_EVENT_STATE_INACTIVE;
4660    else
4661        child_event->state = PERF_EVENT_STATE_OFF;
4662
4663    if (parent_event->attr.freq)
4664        child_event->hw.sample_period = parent_event->hw.sample_period;
4665
4666    /*
4667     * Link it up in the child's context:
4668     */
4669    add_event_to_ctx(child_event, child_ctx);
4670
4671    /*
4672     * Get a reference to the parent filp - we will fput it
4673     * when the child event exits. This is safe to do because
4674     * we are in the parent and we know that the filp still
4675     * exists and has a nonzero count:
4676     */
4677    atomic_long_inc(&parent_event->filp->f_count);
4678
4679    /*
4680     * Link this into the parent event's child list
4681     */
4682    WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4683    mutex_lock(&parent_event->child_mutex);
4684    list_add_tail(&child_event->child_list, &parent_event->child_list);
4685    mutex_unlock(&parent_event->child_mutex);
4686
4687    return child_event;
4688}
4689
4690static int inherit_group(struct perf_event *parent_event,
4691          struct task_struct *parent,
4692          struct perf_event_context *parent_ctx,
4693          struct task_struct *child,
4694          struct perf_event_context *child_ctx)
4695{
4696    struct perf_event *leader;
4697    struct perf_event *sub;
4698    struct perf_event *child_ctr;
4699
4700    leader = inherit_event(parent_event, parent, parent_ctx,
4701                 child, NULL, child_ctx);
4702    if (IS_ERR(leader))
4703        return PTR_ERR(leader);
4704    list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4705        child_ctr = inherit_event(sub, parent, parent_ctx,
4706                        child, leader, child_ctx);
4707        if (IS_ERR(child_ctr))
4708            return PTR_ERR(child_ctr);
4709    }
4710    return 0;
4711}
4712
4713static void sync_child_event(struct perf_event *child_event,
4714                   struct task_struct *child)
4715{
4716    struct perf_event *parent_event = child_event->parent;
4717    u64 child_val;
4718
4719    if (child_event->attr.inherit_stat)
4720        perf_event_read_event(child_event, child);
4721
4722    child_val = atomic64_read(&child_event->count);
4723
4724    /*
4725     * Add back the child's count to the parent's count:
4726     */
4727    atomic64_add(child_val, &parent_event->count);
4728    atomic64_add(child_event->total_time_enabled,
4729             &parent_event->child_total_time_enabled);
4730    atomic64_add(child_event->total_time_running,
4731             &parent_event->child_total_time_running);
4732
4733    /*
4734     * Remove this event from the parent's list
4735     */
4736    WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4737    mutex_lock(&parent_event->child_mutex);
4738    list_del_init(&child_event->child_list);
4739    mutex_unlock(&parent_event->child_mutex);
4740
4741    /*
4742     * Release the parent event, if this was the last
4743     * reference to it.
4744     */
4745    fput(parent_event->filp);
4746}
4747
4748static void
4749__perf_event_exit_task(struct perf_event *child_event,
4750             struct perf_event_context *child_ctx,
4751             struct task_struct *child)
4752{
4753    struct perf_event *parent_event;
4754
4755    update_event_times(child_event);
4756    perf_event_remove_from_context(child_event);
4757
4758    parent_event = child_event->parent;
4759    /*
4760     * It can happen that parent exits first, and has events
4761     * that are still around due to the child reference. These
4762     * events need to be zapped - but otherwise linger.
4763     */
4764    if (parent_event) {
4765        sync_child_event(child_event, child);
4766        free_event(child_event);
4767    }
4768}
4769
/*
 * When a child task exits, feed back event values to parent events.
 *
 * If the task has no event context only the task event is reported.
 * Otherwise the context is scheduled out, detached from the task and
 * uncloned, the exit is reported, every event in the context is torn
 * down via __perf_event_exit_task(), and finally the context reference
 * is dropped.
 */
void perf_event_exit_task(struct task_struct *child)
{
    struct perf_event *child_event, *tmp;
    struct perf_event_context *child_ctx;
    unsigned long flags;

    if (likely(!child->perf_event_ctxp)) {
        perf_event_task(child, NULL, 0);
        return;
    }

    local_irq_save(flags);
    /*
     * We can't reschedule here because interrupts are disabled,
     * and either child is current or it is a task that can't be
     * scheduled, so we are now safe from rescheduling changing
     * our context.
     */
    child_ctx = child->perf_event_ctxp;
    __perf_event_task_sched_out(child_ctx);

    /*
     * Take the context lock here so that if find_get_context is
     * reading child->perf_event_ctxp, we wait until it has
     * incremented the context's refcount before we do put_ctx below.
     */
    spin_lock(&child_ctx->lock);
    child->perf_event_ctxp = NULL;
    /*
     * If this context is a clone; unclone it so it can't get
     * swapped to another process while we're removing all
     * the events from it.
     */
    unclone_ctx(child_ctx);
    /* drops the lock and restores the irq state saved above */
    spin_unlock_irqrestore(&child_ctx->lock, flags);

    /*
     * Report the task dead after unscheduling the events so that we
     * won't get any samples after PERF_RECORD_EXIT. We can however still
     * get a few PERF_RECORD_READ events.
     */
    perf_event_task(child, child_ctx, 0);

    /*
     * We can recurse on the same lock type through:
     *
     *   __perf_event_exit_task()
     *     sync_child_event()
     *       fput(parent_event->filp)
     *         perf_release()
     *           mutex_lock(&ctx->mutex)
     *
     * But since its the parent context it won't be the same instance.
     */
    mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
    list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
                 group_entry)
        __perf_event_exit_task(child_event, child_ctx, child);

    /*
     * If the last event was a group event, it will have appended all
     * its siblings to the list, but we obtained 'tmp' before that which
     * will still point to the list head terminating the iteration.
     */
    if (!list_empty(&child_ctx->group_list))
        goto again;

    mutex_unlock(&child_ctx->mutex);

    put_ctx(child_ctx);
}
4846
4847/*
4848 * free an unexposed, unused context as created by inheritance by
4849 * init_task below, used by fork() in case of fail.
4850 */
4851void perf_event_free_task(struct task_struct *task)
4852{
4853    struct perf_event_context *ctx = task->perf_event_ctxp;
4854    struct perf_event *event, *tmp;
4855
4856    if (!ctx)
4857        return;
4858
4859    mutex_lock(&ctx->mutex);
4860again:
4861    list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4862        struct perf_event *parent = event->parent;
4863
4864        if (WARN_ON_ONCE(!parent))
4865            continue;
4866
4867        mutex_lock(&parent->child_mutex);
4868        list_del_init(&event->child_list);
4869        mutex_unlock(&parent->child_mutex);
4870
4871        fput(parent->filp);
4872
4873        list_del_event(event, ctx);
4874        free_event(event);
4875    }
4876
4877    if (!list_empty(&ctx->group_list))
4878        goto again;
4879
4880    mutex_unlock(&ctx->mutex);
4881
4882    put_ctx(ctx);
4883}
4884
/*
 * Initialize the perf_event context in task_struct
 *
 * Runs in the parent while it forks @child.  If the parent has an event
 * context, a new context is allocated for the child and every event
 * group marked attr.inherit is copied into it.
 *
 * Returns 0 on success (including when there is nothing to inherit),
 * -ENOMEM if the child context cannot be allocated, or the error
 * returned by inherit_group().
 */
int perf_event_init_task(struct task_struct *child)
{
    struct perf_event_context *child_ctx, *parent_ctx;
    struct perf_event_context *cloned_ctx;
    struct perf_event *event;
    struct task_struct *parent = current;
    /* stays 1 only if every parent group was inherited successfully */
    int inherited_all = 1;
    int ret = 0;

    child->perf_event_ctxp = NULL;

    mutex_init(&child->perf_event_mutex);
    INIT_LIST_HEAD(&child->perf_event_list);

    if (likely(!parent->perf_event_ctxp))
        return 0;

    /*
     * This is executed from the parent task context, so inherit
     * events that have been marked for cloning.
     * First allocate and initialize a context for the child.
     */

    child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
    if (!child_ctx)
        return -ENOMEM;

    __perf_event_init_context(child_ctx, child);
    child->perf_event_ctxp = child_ctx;
    /* NOTE(review): presumably paired with a put when the context is freed - confirm */
    get_task_struct(child);

    /*
     * If the parent's context is a clone, pin it so it won't get
     * swapped under us.
     */
    parent_ctx = perf_pin_task_context(parent);

    /*
     * No need to check if parent_ctx != NULL here; since we saw
     * it non-NULL earlier, the only reason for it to become NULL
     * is if we exit, and since we're currently in the middle of
     * a fork we can't be exiting at the same time.
     */

    /*
     * Lock the parent list. No need to lock the child - not PID
     * hashed yet and not running, so nobody can access it.
     */
    mutex_lock(&parent_ctx->mutex);

    /*
     * We dont have to disable NMIs - we are only looking at
     * the list, not manipulating it:
     */
    list_for_each_entry(event, &parent_ctx->group_list, group_entry) {

        /* groups not marked for inheritance are skipped */
        if (!event->attr.inherit) {
            inherited_all = 0;
            continue;
        }

        ret = inherit_group(event, parent, parent_ctx,
                         child, child_ctx);
        /* stop at the first failure; ret is handed back to the caller */
        if (ret) {
            inherited_all = 0;
            break;
        }
    }

    if (inherited_all) {
        /*
         * Mark the child context as a clone of the parent
         * context, or of whatever the parent is a clone of.
         * Note that if the parent is a clone, it could get
         * uncloned at any point, but that doesn't matter
         * because the list of events and the generation
         * count can't have changed since we took the mutex.
         */
        cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
        if (cloned_ctx) {
            child_ctx->parent_ctx = cloned_ctx;
            child_ctx->parent_gen = parent_ctx->parent_gen;
        } else {
            child_ctx->parent_ctx = parent_ctx;
            child_ctx->parent_gen = parent_ctx->generation;
        }
        /* the child context holds a reference on the context it clones */
        get_ctx(child_ctx->parent_ctx);
    }

    mutex_unlock(&parent_ctx->mutex);

    perf_unpin_context(parent_ctx);

    return ret;
}
4983
4984static void __cpuinit perf_event_init_cpu(int cpu)
4985{
4986    struct perf_cpu_context *cpuctx;
4987
4988    cpuctx = &per_cpu(perf_cpu_context, cpu);
4989    __perf_event_init_context(&cpuctx->ctx, NULL);
4990
4991    spin_lock(&perf_resource_lock);
4992    cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4993    spin_unlock(&perf_resource_lock);
4994
4995    hw_perf_event_setup(cpu);
4996}
4997
4998#ifdef CONFIG_HOTPLUG_CPU
4999static void __perf_event_exit_cpu(void *info)
5000{
5001    struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5002    struct perf_event_context *ctx = &cpuctx->ctx;
5003    struct perf_event *event, *tmp;
5004
5005    list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5006        __perf_event_remove_from_context(event);
5007}
5008static void perf_event_exit_cpu(int cpu)
5009{
5010    struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5011    struct perf_event_context *ctx = &cpuctx->ctx;
5012
5013    mutex_lock(&ctx->mutex);
5014    smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5015    mutex_unlock(&ctx->mutex);
5016}
5017#else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
5019#endif
5020
5021static int __cpuinit
5022perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5023{
5024    unsigned int cpu = (long)hcpu;
5025
5026    switch (action) {
5027
5028    case CPU_UP_PREPARE:
5029    case CPU_UP_PREPARE_FROZEN:
5030        perf_event_init_cpu(cpu);
5031        break;
5032
5033    case CPU_ONLINE:
5034    case CPU_ONLINE_FROZEN:
5035        hw_perf_event_setup_online(cpu);
5036        break;
5037
5038    case CPU_DOWN_PREPARE:
5039    case CPU_DOWN_PREPARE_FROZEN:
5040        perf_event_exit_cpu(cpu);
5041        break;
5042
5043    default:
5044        break;
5045    }
5046
5047    return NOTIFY_OK;
5048}
5049
/*
 * Notifier block that routes cpu notifications to perf_cpu_notify().
 *
 * This has to have a higher priority than migration_notifier in sched.c.
 */
static struct notifier_block __cpuinitdata perf_cpu_nb = {
    .notifier_call = perf_cpu_notify,
    .priority = 20,
};
5057
5058void __init perf_event_init(void)
5059{
5060    perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5061            (void *)(long)smp_processor_id());
5062    perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5063            (void *)(long)smp_processor_id());
5064    register_cpu_notifier(&perf_cpu_nb);
5065}
5066
5067static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5068{
5069    return sprintf(buf, "%d\n", perf_reserved_percpu);
5070}
5071
5072static ssize_t
5073perf_set_reserve_percpu(struct sysdev_class *class,
5074            const char *buf,
5075            size_t count)
5076{
5077    struct perf_cpu_context *cpuctx;
5078    unsigned long val;
5079    int err, cpu, mpt;
5080
5081    err = strict_strtoul(buf, 10, &val);
5082    if (err)
5083        return err;
5084    if (val > perf_max_events)
5085        return -EINVAL;
5086
5087    spin_lock(&perf_resource_lock);
5088    perf_reserved_percpu = val;
5089    for_each_online_cpu(cpu) {
5090        cpuctx = &per_cpu(perf_cpu_context, cpu);
5091        spin_lock_irq(&cpuctx->ctx.lock);
5092        mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5093              perf_max_events - perf_reserved_percpu);
5094        cpuctx->max_pertask = mpt;
5095        spin_unlock_irq(&cpuctx->ctx.lock);
5096    }
5097    spin_unlock(&perf_resource_lock);
5098
5099    return count;
5100}
5101
5102static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5103{
5104    return sprintf(buf, "%d\n", perf_overcommit);
5105}
5106
5107static ssize_t
5108perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5109{
5110    unsigned long val;
5111    int err;
5112
5113    err = strict_strtoul(buf, 10, &val);
5114    if (err)
5115        return err;
5116    if (val > 1)
5117        return -EINVAL;
5118
5119    spin_lock(&perf_resource_lock);
5120    perf_overcommit = val;
5121    spin_unlock(&perf_resource_lock);
5122
5123    return count;
5124}
5125
/*
 * "reserve_percpu" attribute, mode 0644, backed by
 * perf_show_reserve_percpu() / perf_set_reserve_percpu().
 */
static SYSDEV_CLASS_ATTR(
                reserve_percpu,
                0644,
                perf_show_reserve_percpu,
                perf_set_reserve_percpu
            );

/*
 * "overcommit" attribute, mode 0644, backed by
 * perf_show_overcommit() / perf_set_overcommit().
 */
static SYSDEV_CLASS_ATTR(
                overcommit,
                0644,
                perf_show_overcommit,
                perf_set_overcommit
            );

/* NULL-terminated list of the attributes defined above */
static struct attribute *perfclass_attrs[] = {
    &attr_reserve_percpu.attr,
    &attr_overcommit.attr,
    NULL
};

/* attribute group named "perf_events" bundling the attributes above */
static struct attribute_group perfclass_attr_group = {
    .attrs = perfclass_attrs,
    .name = "perf_events",
};
5150
5151static int __init perf_event_sysfs_init(void)
5152{
5153    return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5154                  &perfclass_attr_group);
5155}
5156device_initcall(perf_event_sysfs_init);
5157

Archive Download this file



interactive