kernel/events/core.c

1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/idr.h>
17#include <linux/file.h>
18#include <linux/poll.h>
19#include <linux/slab.h>
20#include <linux/hash.h>
21#include <linux/sysfs.h>
22#include <linux/dcache.h>
23#include <linux/percpu.h>
24#include <linux/ptrace.h>
25#include <linux/reboot.h>
26#include <linux/vmstat.h>
27#include <linux/device.h>
28#include <linux/export.h>
29#include <linux/vmalloc.h>
30#include <linux/hardirq.h>
31#include <linux/rculist.h>
32#include <linux/uaccess.h>
33#include <linux/syscalls.h>
34#include <linux/anon_inodes.h>
35#include <linux/kernel_stat.h>
36#include <linux/perf_event.h>
37#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h>
39
40#include "internal.h"
41
42#include <asm/irq_regs.h>
43
44struct remote_function_call {
45    struct task_struct *p;
46    int (*func)(void *info);
47    void *info;
48    int ret;
49};
50
51static void remote_function(void *data)
52{
53    struct remote_function_call *tfc = data;
54    struct task_struct *p = tfc->p;
55
56    if (p) {
57        tfc->ret = -EAGAIN;
58        if (task_cpu(p) != smp_processor_id() || !task_curr(p))
59            return;
60    }
61
62    tfc->ret = tfc->func(tfc->info);
63}
64
65/**
66 * task_function_call - call a function on the cpu on which a task runs
67 * @p: the task to evaluate
68 * @func: the function to be called
69 * @info: the function call argument
70 *
71 * Calls the function @func when the task is currently running. This might
72 * be on the current CPU, which just calls the function directly.
73 *
74 * returns: @func return value, or
75 * -ESRCH - when the process isn't running
76 * -EAGAIN - when the process moved away
77 */
78static int
79task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
80{
81    struct remote_function_call data = {
82        .p = p,
83        .func = func,
84        .info = info,
85        .ret = -ESRCH, /* No such (running) process */
86    };
87
88    if (task_curr(p))
89        smp_call_function_single(task_cpu(p), remote_function, &data, 1);
90
91    return data.ret;
92}
93
94/**
95 * cpu_function_call - call a function on the cpu
96 * @func: the function to be called
97 * @info: the function call argument
98 *
99 * Calls the function @func on the remote cpu.
100 *
101 * returns: @func return value or -ENXIO when the cpu is offline
102 */
103static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
104{
105    struct remote_function_call data = {
106        .p = NULL,
107        .func = func,
108        .info = info,
109        .ret = -ENXIO, /* No such CPU */
110    };
111
112    smp_call_function_single(cpu, remote_function, &data, 1);
113
114    return data.ret;
115}
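/*
 * Illustrative sketch (not part of the original file): one way a caller
 * might use the two helpers above. The names example_remote_op() and
 * example_run_on_task_cpu() are hypothetical; a retry loop is one way to
 * handle the -EAGAIN that remote_function() reports when the target task
 * migrates between the task_curr() check and the IPI.
 */
#if 0
static int example_remote_op(void *info)
{
    /* Runs on the CPU where the target task is current. */
    return 0;
}

static void example_run_on_task_cpu(struct task_struct *p)
{
    int ret;

    do {
        ret = task_function_call(p, example_remote_op, NULL);
    } while (ret == -EAGAIN); /* task moved away, try again */
}
#endif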
116
117#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
118               PERF_FLAG_FD_OUTPUT |\
119               PERF_FLAG_PID_CGROUP)
120
121/*
122 * branch priv levels that need permission checks
123 */
124#define PERF_SAMPLE_BRANCH_PERM_PLM \
125    (PERF_SAMPLE_BRANCH_KERNEL |\
126     PERF_SAMPLE_BRANCH_HV)
127
128enum event_type_t {
129    EVENT_FLEXIBLE = 0x1,
130    EVENT_PINNED = 0x2,
131    EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
132};
133
134/*
135 * perf_sched_events : >0 events exist
136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
137 */
138struct static_key_deferred perf_sched_events __read_mostly;
139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
141
142static atomic_t nr_mmap_events __read_mostly;
143static atomic_t nr_comm_events __read_mostly;
144static atomic_t nr_task_events __read_mostly;
145
146static LIST_HEAD(pmus);
147static DEFINE_MUTEX(pmus_lock);
148static struct srcu_struct pmus_srcu;
149
150/*
151 * perf event paranoia level:
152 * -1 - not paranoid at all
153 * 0 - disallow raw tracepoint access for unpriv
154 * 1 - disallow cpu events for unpriv
155 * 2 - disallow kernel profiling for unpriv
156 */
157int sysctl_perf_event_paranoid __read_mostly = 1;
158
159/* Minimum for 512 kiB + 1 user control page */
160int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
161
162/*
163 * max perf event sample rate
164 */
165#define DEFAULT_MAX_SAMPLE_RATE 100000
166int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
167static int max_samples_per_tick __read_mostly =
168    DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
169
170int perf_proc_update_handler(struct ctl_table *table, int write,
171        void __user *buffer, size_t *lenp,
172        loff_t *ppos)
173{
174    int ret = proc_dointvec(table, write, buffer, lenp, ppos);
175
176    if (ret || !write)
177        return ret;
178
179    max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
180
181    return 0;
182}
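/*
 * Worked example (illustrative, assuming the values above): with
 * sysctl_perf_event_sample_rate at its DEFAULT_MAX_SAMPLE_RATE of 100000
 * and HZ=1000, DIV_ROUND_UP(100000, 1000) caps max_samples_per_tick at
 * 100 samples per tick; with HZ=250 the cap becomes 400. Writing a new
 * value to the sysctl recomputes the cap in perf_proc_update_handler().
 */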
183
184static atomic64_t perf_event_id;
185
186static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
187                  enum event_type_t event_type);
188
189static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
190                 enum event_type_t event_type,
191                 struct task_struct *task);
192
193static void update_context_time(struct perf_event_context *ctx);
194static u64 perf_event_time(struct perf_event *event);
195
196static void ring_buffer_attach(struct perf_event *event,
197                   struct ring_buffer *rb);
198
199void __weak perf_event_print_debug(void) { }
200
201extern __weak const char *perf_pmu_name(void)
202{
203    return "pmu";
204}
205
206static inline u64 perf_clock(void)
207{
208    return local_clock();
209}
210
211static inline struct perf_cpu_context *
212__get_cpu_context(struct perf_event_context *ctx)
213{
214    return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
215}
216
217static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
218              struct perf_event_context *ctx)
219{
220    raw_spin_lock(&cpuctx->ctx.lock);
221    if (ctx)
222        raw_spin_lock(&ctx->lock);
223}
224
225static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
226                struct perf_event_context *ctx)
227{
228    if (ctx)
229        raw_spin_unlock(&ctx->lock);
230    raw_spin_unlock(&cpuctx->ctx.lock);
231}
232
233#ifdef CONFIG_CGROUP_PERF
234
235/*
236 * Must ensure cgroup is pinned (css_get) before calling
237 * this function. In other words, we cannot call this function
238 * if there is no cgroup event for the current CPU context.
239 */
240static inline struct perf_cgroup *
241perf_cgroup_from_task(struct task_struct *task)
242{
243    return container_of(task_subsys_state(task, perf_subsys_id),
244            struct perf_cgroup, css);
245}
246
247static inline bool
248perf_cgroup_match(struct perf_event *event)
249{
250    struct perf_event_context *ctx = event->ctx;
251    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
252
253    return !event->cgrp || event->cgrp == cpuctx->cgrp;
254}
255
256static inline bool perf_tryget_cgroup(struct perf_event *event)
257{
258    return css_tryget(&event->cgrp->css);
259}
260
261static inline void perf_put_cgroup(struct perf_event *event)
262{
263    css_put(&event->cgrp->css);
264}
265
266static inline void perf_detach_cgroup(struct perf_event *event)
267{
268    perf_put_cgroup(event);
269    event->cgrp = NULL;
270}
271
272static inline int is_cgroup_event(struct perf_event *event)
273{
274    return event->cgrp != NULL;
275}
276
277static inline u64 perf_cgroup_event_time(struct perf_event *event)
278{
279    struct perf_cgroup_info *t;
280
281    t = per_cpu_ptr(event->cgrp->info, event->cpu);
282    return t->time;
283}
284
285static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
286{
287    struct perf_cgroup_info *info;
288    u64 now;
289
290    now = perf_clock();
291
292    info = this_cpu_ptr(cgrp->info);
293
294    info->time += now - info->timestamp;
295    info->timestamp = now;
296}
297
298static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
299{
300    struct perf_cgroup *cgrp_out = cpuctx->cgrp;
301    if (cgrp_out)
302        __update_cgrp_time(cgrp_out);
303}
304
305static inline void update_cgrp_time_from_event(struct perf_event *event)
306{
307    struct perf_cgroup *cgrp;
308
309    /*
310     * ensure we access cgroup data only when needed and
311     * when we know the cgroup is pinned (css_get)
312     */
313    if (!is_cgroup_event(event))
314        return;
315
316    cgrp = perf_cgroup_from_task(current);
317    /*
318     * Do not update time when cgroup is not active
319     */
320    if (cgrp == event->cgrp)
321        __update_cgrp_time(event->cgrp);
322}
323
324static inline void
325perf_cgroup_set_timestamp(struct task_struct *task,
326              struct perf_event_context *ctx)
327{
328    struct perf_cgroup *cgrp;
329    struct perf_cgroup_info *info;
330
331    /*
332     * ctx->lock held by caller
333     * ensure we do not access cgroup data
334     * unless we have the cgroup pinned (css_get)
335     */
336    if (!task || !ctx->nr_cgroups)
337        return;
338
339    cgrp = perf_cgroup_from_task(task);
340    info = this_cpu_ptr(cgrp->info);
341    info->timestamp = ctx->timestamp;
342}
343
344#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
345#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
346
347/*
348 * reschedule events based on the cgroup constraint of task.
349 *
350 * mode SWOUT : schedule out everything
351 * mode SWIN : schedule in based on cgroup for next
352 */
353void perf_cgroup_switch(struct task_struct *task, int mode)
354{
355    struct perf_cpu_context *cpuctx;
356    struct pmu *pmu;
357    unsigned long flags;
358
359    /*
360     * disable interrupts to avoid getting nr_cgroups
361     * changes via __perf_event_disable(). Also
362     * avoids preemption.
363     */
364    local_irq_save(flags);
365
366    /*
367     * we reschedule only in the presence of cgroup
368     * constrained events.
369     */
370    rcu_read_lock();
371
372    list_for_each_entry_rcu(pmu, &pmus, entry) {
373        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
374
375        /*
376         * perf_cgroup_events says at least one
377         * context on this CPU has cgroup events.
378         *
379         * ctx->nr_cgroups reports the number of cgroup
380         * events for a context.
381         */
382        if (cpuctx->ctx.nr_cgroups > 0) {
383            perf_ctx_lock(cpuctx, cpuctx->task_ctx);
384            perf_pmu_disable(cpuctx->ctx.pmu);
385
386            if (mode & PERF_CGROUP_SWOUT) {
387                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
388                /*
389                 * must not be done before ctxswout due
390                 * to event_filter_match() in event_sched_out()
391                 */
392                cpuctx->cgrp = NULL;
393            }
394
395            if (mode & PERF_CGROUP_SWIN) {
396                WARN_ON_ONCE(cpuctx->cgrp);
397                /* set cgrp before ctxsw in to
398                 * allow event_filter_match() to not
399                 * have to pass task around
400                 */
401                cpuctx->cgrp = perf_cgroup_from_task(task);
402                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
403            }
404            perf_pmu_enable(cpuctx->ctx.pmu);
405            perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
406        }
407    }
408
409    rcu_read_unlock();
410
411    local_irq_restore(flags);
412}
413
414static inline void perf_cgroup_sched_out(struct task_struct *task,
415                     struct task_struct *next)
416{
417    struct perf_cgroup *cgrp1;
418    struct perf_cgroup *cgrp2 = NULL;
419
420    /*
421     * we come here when we know perf_cgroup_events > 0
422     */
423    cgrp1 = perf_cgroup_from_task(task);
424
425    /*
426     * next is NULL when called from perf_event_enable_on_exec()
427     * that will systematically cause a cgroup_switch()
428     */
429    if (next)
430        cgrp2 = perf_cgroup_from_task(next);
431
432    /*
433     * only schedule out current cgroup events if we know
434     * that we are switching to a different cgroup. Otherwise,
435     * do not touch the cgroup events.
436     */
437    if (cgrp1 != cgrp2)
438        perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
439}
440
441static inline void perf_cgroup_sched_in(struct task_struct *prev,
442                    struct task_struct *task)
443{
444    struct perf_cgroup *cgrp1;
445    struct perf_cgroup *cgrp2 = NULL;
446
447    /*
448     * we come here when we know perf_cgroup_events > 0
449     */
450    cgrp1 = perf_cgroup_from_task(task);
451
452    /* prev can never be NULL */
453    cgrp2 = perf_cgroup_from_task(prev);
454
455    /*
456     * only need to schedule in cgroup events if we are changing
457     * cgroup during the context switch. Cgroup events were not
458     * scheduled out during the switch-out if that was not the case.
459     */
460    if (cgrp1 != cgrp2)
461        perf_cgroup_switch(task, PERF_CGROUP_SWIN);
462}
463
464static inline int perf_cgroup_connect(int fd, struct perf_event *event,
465                      struct perf_event_attr *attr,
466                      struct perf_event *group_leader)
467{
468    struct perf_cgroup *cgrp;
469    struct cgroup_subsys_state *css;
470    struct file *file;
471    int ret = 0, fput_needed;
472
473    file = fget_light(fd, &fput_needed);
474    if (!file)
475        return -EBADF;
476
477    css = cgroup_css_from_dir(file, perf_subsys_id);
478    if (IS_ERR(css)) {
479        ret = PTR_ERR(css);
480        goto out;
481    }
482
483    cgrp = container_of(css, struct perf_cgroup, css);
484    event->cgrp = cgrp;
485
486    /* must be done before we fput() the file */
487    if (!perf_tryget_cgroup(event)) {
488        event->cgrp = NULL;
489        ret = -ENOENT;
490        goto out;
491    }
492
493    /*
494     * all events in a group must monitor
495     * the same cgroup because a task belongs
496     * to only one perf cgroup at a time
497     */
498    if (group_leader && group_leader->cgrp != cgrp) {
499        perf_detach_cgroup(event);
500        ret = -EINVAL;
501    }
502out:
503    fput_light(file, fput_needed);
504    return ret;
505}
506
507static inline void
508perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
509{
510    struct perf_cgroup_info *t;
511    t = per_cpu_ptr(event->cgrp->info, event->cpu);
512    event->shadow_ctx_time = now - t->timestamp;
513}
514
515static inline void
516perf_cgroup_defer_enabled(struct perf_event *event)
517{
518    /*
519     * when the current task's perf cgroup does not match
520     * the event's, we need to remember to call the
521     * perf_cgroup_mark_enabled() function the first time a task with
522     * a matching perf cgroup is scheduled in.
523     */
524    if (is_cgroup_event(event) && !perf_cgroup_match(event))
525        event->cgrp_defer_enabled = 1;
526}
527
528static inline void
529perf_cgroup_mark_enabled(struct perf_event *event,
530             struct perf_event_context *ctx)
531{
532    struct perf_event *sub;
533    u64 tstamp = perf_event_time(event);
534
535    if (!event->cgrp_defer_enabled)
536        return;
537
538    event->cgrp_defer_enabled = 0;
539
540    event->tstamp_enabled = tstamp - event->total_time_enabled;
541    list_for_each_entry(sub, &event->sibling_list, group_entry) {
542        if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
543            sub->tstamp_enabled = tstamp - sub->total_time_enabled;
544            sub->cgrp_defer_enabled = 0;
545        }
546    }
547}
548#else /* !CONFIG_CGROUP_PERF */
549
550static inline bool
551perf_cgroup_match(struct perf_event *event)
552{
553    return true;
554}
555
556static inline void perf_detach_cgroup(struct perf_event *event)
557{}
558
559static inline int is_cgroup_event(struct perf_event *event)
560{
561    return 0;
562}
563
564static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
565{
566    return 0;
567}
568
569static inline void update_cgrp_time_from_event(struct perf_event *event)
570{
571}
572
573static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
574{
575}
576
577static inline void perf_cgroup_sched_out(struct task_struct *task,
578                     struct task_struct *next)
579{
580}
581
582static inline void perf_cgroup_sched_in(struct task_struct *prev,
583                    struct task_struct *task)
584{
585}
586
587static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
588                      struct perf_event_attr *attr,
589                      struct perf_event *group_leader)
590{
591    return -EINVAL;
592}
593
594static inline void
595perf_cgroup_set_timestamp(struct task_struct *task,
596              struct perf_event_context *ctx)
597{
598}
599
600void
601perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
602{
603}
604
605static inline void
606perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
607{
608}
609
610static inline u64 perf_cgroup_event_time(struct perf_event *event)
611{
612    return 0;
613}
614
615static inline void
616perf_cgroup_defer_enabled(struct perf_event *event)
617{
618}
619
620static inline void
621perf_cgroup_mark_enabled(struct perf_event *event,
622             struct perf_event_context *ctx)
623{
624}
625#endif
626
627void perf_pmu_disable(struct pmu *pmu)
628{
629    int *count = this_cpu_ptr(pmu->pmu_disable_count);
630    if (!(*count)++)
631        pmu->pmu_disable(pmu);
632}
633
634void perf_pmu_enable(struct pmu *pmu)
635{
636    int *count = this_cpu_ptr(pmu->pmu_disable_count);
637    if (!--(*count))
638        pmu->pmu_enable(pmu);
639}
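/*
 * Illustrative sketch (not part of the original file): the bracketing
 * pattern used throughout this file when a context's events are touched,
 * shown with a hypothetical helper name. perf_pmu_disable() nests via the
 * per-cpu pmu_disable_count, so the hardware is only re-enabled when the
 * outermost caller runs perf_pmu_enable().
 */
#if 0
static void example_touch_context(struct perf_cpu_context *cpuctx)
{
    perf_ctx_lock(cpuctx, cpuctx->task_ctx);
    perf_pmu_disable(cpuctx->ctx.pmu);

    /* ... schedule events in or out here ... */

    perf_pmu_enable(cpuctx->ctx.pmu);
    perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
#endif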
640
641static DEFINE_PER_CPU(struct list_head, rotation_list);
642
643/*
644 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
645 * because they're strictly cpu affine and rotate_start is called with IRQs
646 * disabled, while rotate_context is called from IRQ context.
647 */
648static void perf_pmu_rotate_start(struct pmu *pmu)
649{
650    struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
651    struct list_head *head = &__get_cpu_var(rotation_list);
652
653    WARN_ON(!irqs_disabled());
654
655    if (list_empty(&cpuctx->rotation_list))
656        list_add(&cpuctx->rotation_list, head);
657}
658
659static void get_ctx(struct perf_event_context *ctx)
660{
661    WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
662}
663
664static void put_ctx(struct perf_event_context *ctx)
665{
666    if (atomic_dec_and_test(&ctx->refcount)) {
667        if (ctx->parent_ctx)
668            put_ctx(ctx->parent_ctx);
669        if (ctx->task)
670            put_task_struct(ctx->task);
671        kfree_rcu(ctx, rcu_head);
672    }
673}
674
675static void unclone_ctx(struct perf_event_context *ctx)
676{
677    if (ctx->parent_ctx) {
678        put_ctx(ctx->parent_ctx);
679        ctx->parent_ctx = NULL;
680    }
681}
682
683static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
684{
685    /*
686     * only top level events have the pid namespace they were created in
687     */
688    if (event->parent)
689        event = event->parent;
690
691    return task_tgid_nr_ns(p, event->ns);
692}
693
694static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
695{
696    /*
697     * only top level events have the pid namespace they were created in
698     */
699    if (event->parent)
700        event = event->parent;
701
702    return task_pid_nr_ns(p, event->ns);
703}
704
705/*
706 * If we inherit events we want to return the parent event id
707 * to userspace.
708 */
709static u64 primary_event_id(struct perf_event *event)
710{
711    u64 id = event->id;
712
713    if (event->parent)
714        id = event->parent->id;
715
716    return id;
717}
718
719/*
720 * Get the perf_event_context for a task and lock it.
721 * This has to cope with the fact that until it is locked,
722 * the context could get moved to another task.
723 */
724static struct perf_event_context *
725perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
726{
727    struct perf_event_context *ctx;
728
729    rcu_read_lock();
730retry:
731    ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
732    if (ctx) {
733        /*
734         * If this context is a clone of another, it might
735         * get swapped for another underneath us by
736         * perf_event_task_sched_out, though the
737         * rcu_read_lock() protects us from any context
738         * getting freed. Lock the context and check if it
739         * got swapped before we could get the lock, and retry
740         * if so. If we locked the right context, then it
741         * can't get swapped on us any more.
742         */
743        raw_spin_lock_irqsave(&ctx->lock, *flags);
744        if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
745            raw_spin_unlock_irqrestore(&ctx->lock, *flags);
746            goto retry;
747        }
748
749        if (!atomic_inc_not_zero(&ctx->refcount)) {
750            raw_spin_unlock_irqrestore(&ctx->lock, *flags);
751            ctx = NULL;
752        }
753    }
754    rcu_read_unlock();
755    return ctx;
756}
757
758/*
759 * Get the context for a task and increment its pin_count so it
760 * can't get swapped to another task. This also increments its
761 * reference count so that the context can't get freed.
762 */
763static struct perf_event_context *
764perf_pin_task_context(struct task_struct *task, int ctxn)
765{
766    struct perf_event_context *ctx;
767    unsigned long flags;
768
769    ctx = perf_lock_task_context(task, ctxn, &flags);
770    if (ctx) {
771        ++ctx->pin_count;
772        raw_spin_unlock_irqrestore(&ctx->lock, flags);
773    }
774    return ctx;
775}
776
777static void perf_unpin_context(struct perf_event_context *ctx)
778{
779    unsigned long flags;
780
781    raw_spin_lock_irqsave(&ctx->lock, flags);
782    --ctx->pin_count;
783    raw_spin_unlock_irqrestore(&ctx->lock, flags);
784}
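/*
 * Illustrative sketch (not part of the original file): the pin/unpin pair
 * above lets a caller keep a task's context from being swapped or freed
 * while working on it without holding ctx->lock. The function name and the
 * bare ctxn parameter are hypothetical; note that unpinning does not drop
 * the reference taken by perf_pin_task_context(), hence the put_ctx().
 */
#if 0
static void example_use_pinned_context(struct task_struct *task, int ctxn)
{
    struct perf_event_context *ctx;

    ctx = perf_pin_task_context(task, ctxn);
    if (!ctx)
        return;

    /* ctx cannot be swapped to another task or freed here */

    perf_unpin_context(ctx);
    put_ctx(ctx);
}
#endif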
785
786/*
787 * Update the record of the current time in a context.
788 */
789static void update_context_time(struct perf_event_context *ctx)
790{
791    u64 now = perf_clock();
792
793    ctx->time += now - ctx->timestamp;
794    ctx->timestamp = now;
795}
796
797static u64 perf_event_time(struct perf_event *event)
798{
799    struct perf_event_context *ctx = event->ctx;
800
801    if (is_cgroup_event(event))
802        return perf_cgroup_event_time(event);
803
804    return ctx ? ctx->time : 0;
805}
806
807/*
808 * Update the total_time_enabled and total_time_running fields for an event.
809 * The caller of this function needs to hold the ctx->lock.
810 */
811static void update_event_times(struct perf_event *event)
812{
813    struct perf_event_context *ctx = event->ctx;
814    u64 run_end;
815
816    if (event->state < PERF_EVENT_STATE_INACTIVE ||
817        event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
818        return;
819    /*
820     * in cgroup mode, time_enabled represents
821     * the time the event was enabled AND active
822     * tasks were in the monitored cgroup. This is
823     * independent of the activity of the context as
824     * there may be a mix of cgroup and non-cgroup events.
825     *
826     * That is why we treat cgroup events differently
827     * here.
828     */
829    if (is_cgroup_event(event))
830        run_end = perf_cgroup_event_time(event);
831    else if (ctx->is_active)
832        run_end = ctx->time;
833    else
834        run_end = event->tstamp_stopped;
835
836    event->total_time_enabled = run_end - event->tstamp_enabled;
837
838    if (event->state == PERF_EVENT_STATE_INACTIVE)
839        run_end = event->tstamp_stopped;
840    else
841        run_end = perf_event_time(event);
842
843    event->total_time_running = run_end - event->tstamp_running;
844
845}
846
847/*
848 * Update total_time_enabled and total_time_running for all events in a group.
849 */
850static void update_group_times(struct perf_event *leader)
851{
852    struct perf_event *event;
853
854    update_event_times(leader);
855    list_for_each_entry(event, &leader->sibling_list, group_entry)
856        update_event_times(event);
857}
858
859static struct list_head *
860ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
861{
862    if (event->attr.pinned)
863        return &ctx->pinned_groups;
864    else
865        return &ctx->flexible_groups;
866}
867
868/*
869 * Add an event to the lists for its context.
870 * Must be called with ctx->mutex and ctx->lock held.
871 */
872static void
873list_add_event(struct perf_event *event, struct perf_event_context *ctx)
874{
875    WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
876    event->attach_state |= PERF_ATTACH_CONTEXT;
877
878    /*
879     * If we're a stand-alone event or group leader, we go to the context
880     * list; group events are kept attached to the group so that
881     * perf_group_detach can, at all times, locate all siblings.
882     */
883    if (event->group_leader == event) {
884        struct list_head *list;
885
886        if (is_software_event(event))
887            event->group_flags |= PERF_GROUP_SOFTWARE;
888
889        list = ctx_group_list(event, ctx);
890        list_add_tail(&event->group_entry, list);
891    }
892
893    if (is_cgroup_event(event))
894        ctx->nr_cgroups++;
895
896    if (has_branch_stack(event))
897        ctx->nr_branch_stack++;
898
899    list_add_rcu(&event->event_entry, &ctx->event_list);
900    if (!ctx->nr_events)
901        perf_pmu_rotate_start(ctx->pmu);
902    ctx->nr_events++;
903    if (event->attr.inherit_stat)
904        ctx->nr_stat++;
905}
906
907/*
908 * Called at perf_event creation and when events are attached/detached from a
909 * group.
910 */
911static void perf_event__read_size(struct perf_event *event)
912{
913    int entry = sizeof(u64); /* value */
914    int size = 0;
915    int nr = 1;
916
917    if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
918        size += sizeof(u64);
919
920    if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
921        size += sizeof(u64);
922
923    if (event->attr.read_format & PERF_FORMAT_ID)
924        entry += sizeof(u64);
925
926    if (event->attr.read_format & PERF_FORMAT_GROUP) {
927        nr += event->group_leader->nr_siblings;
928        size += sizeof(u64);
929    }
930
931    size += entry * nr;
932    event->read_size = size;
933}
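/*
 * Worked example (illustrative): for a group leader with two siblings and
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING,
 * the function above computes nr = 3, entry = 16 (value + id) and
 * size = 8 (nr) + 8 (time_enabled) + 8 (time_running) + 3 * 16 = 72 bytes,
 * matching the { nr, time_enabled, time_running, { value, id }[nr] }
 * layout that read() produces for such an event.
 */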
934
935static void perf_event__header_size(struct perf_event *event)
936{
937    struct perf_sample_data *data;
938    u64 sample_type = event->attr.sample_type;
939    u16 size = 0;
940
941    perf_event__read_size(event);
942
943    if (sample_type & PERF_SAMPLE_IP)
944        size += sizeof(data->ip);
945
946    if (sample_type & PERF_SAMPLE_ADDR)
947        size += sizeof(data->addr);
948
949    if (sample_type & PERF_SAMPLE_PERIOD)
950        size += sizeof(data->period);
951
952    if (sample_type & PERF_SAMPLE_READ)
953        size += event->read_size;
954
955    event->header_size = size;
956}
957
958static void perf_event__id_header_size(struct perf_event *event)
959{
960    struct perf_sample_data *data;
961    u64 sample_type = event->attr.sample_type;
962    u16 size = 0;
963
964    if (sample_type & PERF_SAMPLE_TID)
965        size += sizeof(data->tid_entry);
966
967    if (sample_type & PERF_SAMPLE_TIME)
968        size += sizeof(data->time);
969
970    if (sample_type & PERF_SAMPLE_ID)
971        size += sizeof(data->id);
972
973    if (sample_type & PERF_SAMPLE_STREAM_ID)
974        size += sizeof(data->stream_id);
975
976    if (sample_type & PERF_SAMPLE_CPU)
977        size += sizeof(data->cpu_entry);
978
979    event->id_header_size = size;
980}
981
982static void perf_group_attach(struct perf_event *event)
983{
984    struct perf_event *group_leader = event->group_leader, *pos;
985
986    /*
987     * We can have double attach due to group movement in perf_event_open.
988     */
989    if (event->attach_state & PERF_ATTACH_GROUP)
990        return;
991
992    event->attach_state |= PERF_ATTACH_GROUP;
993
994    if (group_leader == event)
995        return;
996
997    if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
998            !is_software_event(event))
999        group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1000
1001    list_add_tail(&event->group_entry, &group_leader->sibling_list);
1002    group_leader->nr_siblings++;
1003
1004    perf_event__header_size(group_leader);
1005
1006    list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1007        perf_event__header_size(pos);
1008}
1009
1010/*
1011 * Remove an event from the lists for its context.
1012 * Must be called with ctx->mutex and ctx->lock held.
1013 */
1014static void
1015list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1016{
1017    struct perf_cpu_context *cpuctx;
1018    /*
1019     * We can have double detach due to exit/hot-unplug + close.
1020     */
1021    if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1022        return;
1023
1024    event->attach_state &= ~PERF_ATTACH_CONTEXT;
1025
1026    if (is_cgroup_event(event)) {
1027        ctx->nr_cgroups--;
1028        cpuctx = __get_cpu_context(ctx);
1029        /*
1030         * if there are no more cgroup events
1031         * then clear cgrp to avoid a stale pointer
1032         * in update_cgrp_time_from_cpuctx()
1033         */
1034        if (!ctx->nr_cgroups)
1035            cpuctx->cgrp = NULL;
1036    }
1037
1038    if (has_branch_stack(event))
1039        ctx->nr_branch_stack--;
1040
1041    ctx->nr_events--;
1042    if (event->attr.inherit_stat)
1043        ctx->nr_stat--;
1044
1045    list_del_rcu(&event->event_entry);
1046
1047    if (event->group_leader == event)
1048        list_del_init(&event->group_entry);
1049
1050    update_group_times(event);
1051
1052    /*
1053     * If event was in error state, then keep it
1054     * that way, otherwise bogus counts will be
1055     * returned on read(). The only way to get out
1056     * of error state is by explicit re-enabling
1057     * of the event
1058     */
1059    if (event->state > PERF_EVENT_STATE_OFF)
1060        event->state = PERF_EVENT_STATE_OFF;
1061}
1062
1063static void perf_group_detach(struct perf_event *event)
1064{
1065    struct perf_event *sibling, *tmp;
1066    struct list_head *list = NULL;
1067
1068    /*
1069     * We can have double detach due to exit/hot-unplug + close.
1070     */
1071    if (!(event->attach_state & PERF_ATTACH_GROUP))
1072        return;
1073
1074    event->attach_state &= ~PERF_ATTACH_GROUP;
1075
1076    /*
1077     * If this is a sibling, remove it from its group.
1078     */
1079    if (event->group_leader != event) {
1080        list_del_init(&event->group_entry);
1081        event->group_leader->nr_siblings--;
1082        goto out;
1083    }
1084
1085    if (!list_empty(&event->group_entry))
1086        list = &event->group_entry;
1087
1088    /*
1089     * If this was a group event with sibling events then
1090     * upgrade the siblings to singleton events by adding them
1091     * to whatever list we are on.
1092     */
1093    list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1094        if (list)
1095            list_move_tail(&sibling->group_entry, list);
1096        sibling->group_leader = sibling;
1097
1098        /* Inherit group flags from the previous leader */
1099        sibling->group_flags = event->group_flags;
1100    }
1101
1102out:
1103    perf_event__header_size(event->group_leader);
1104
1105    list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1106        perf_event__header_size(tmp);
1107}
1108
1109static inline int
1110event_filter_match(struct perf_event *event)
1111{
1112    return (event->cpu == -1 || event->cpu == smp_processor_id())
1113        && perf_cgroup_match(event);
1114}
1115
1116static void
1117event_sched_out(struct perf_event *event,
1118          struct perf_cpu_context *cpuctx,
1119          struct perf_event_context *ctx)
1120{
1121    u64 tstamp = perf_event_time(event);
1122    u64 delta;
1123    /*
1124     * An event which could not be activated because of
1125     * filter mismatch still needs to have its timings
1126     * maintained, otherwise bogus information is returned
1127     * via read() for time_enabled, time_running:
1128     */
1129    if (event->state == PERF_EVENT_STATE_INACTIVE
1130        && !event_filter_match(event)) {
1131        delta = tstamp - event->tstamp_stopped;
1132        event->tstamp_running += delta;
1133        event->tstamp_stopped = tstamp;
1134    }
1135
1136    if (event->state != PERF_EVENT_STATE_ACTIVE)
1137        return;
1138
1139    event->state = PERF_EVENT_STATE_INACTIVE;
1140    if (event->pending_disable) {
1141        event->pending_disable = 0;
1142        event->state = PERF_EVENT_STATE_OFF;
1143    }
1144    event->tstamp_stopped = tstamp;
1145    event->pmu->del(event, 0);
1146    event->oncpu = -1;
1147
1148    if (!is_software_event(event))
1149        cpuctx->active_oncpu--;
1150    ctx->nr_active--;
1151    if (event->attr.freq && event->attr.sample_freq)
1152        ctx->nr_freq--;
1153    if (event->attr.exclusive || !cpuctx->active_oncpu)
1154        cpuctx->exclusive = 0;
1155}
1156
1157static void
1158group_sched_out(struct perf_event *group_event,
1159        struct perf_cpu_context *cpuctx,
1160        struct perf_event_context *ctx)
1161{
1162    struct perf_event *event;
1163    int state = group_event->state;
1164
1165    event_sched_out(group_event, cpuctx, ctx);
1166
1167    /*
1168     * Schedule out siblings (if any):
1169     */
1170    list_for_each_entry(event, &group_event->sibling_list, group_entry)
1171        event_sched_out(event, cpuctx, ctx);
1172
1173    if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1174        cpuctx->exclusive = 0;
1175}
1176
1177/*
1178 * Cross CPU call to remove a performance event
1179 *
1180 * We disable the event on the hardware level first. After that we
1181 * remove it from the context list.
1182 */
1183static int __perf_remove_from_context(void *info)
1184{
1185    struct perf_event *event = info;
1186    struct perf_event_context *ctx = event->ctx;
1187    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1188
1189    raw_spin_lock(&ctx->lock);
1190    event_sched_out(event, cpuctx, ctx);
1191    list_del_event(event, ctx);
1192    if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1193        ctx->is_active = 0;
1194        cpuctx->task_ctx = NULL;
1195    }
1196    raw_spin_unlock(&ctx->lock);
1197
1198    return 0;
1199}
1200
1201
1202/*
1203 * Remove the event from a task's (or a CPU's) list of events.
1204 *
1205 * CPU events are removed with a smp call. For task events we only
1206 * call when the task is on a CPU.
1207 *
1208 * If event->ctx is a cloned context, callers must make sure that
1209 * every task struct that event->ctx->task could possibly point to
1210 * remains valid. This is OK when called from perf_release since
1211 * that only calls us on the top-level context, which can't be a clone.
1212 * When called from perf_event_exit_task, it's OK because the
1213 * context has been detached from its task.
1214 */
1215static void perf_remove_from_context(struct perf_event *event)
1216{
1217    struct perf_event_context *ctx = event->ctx;
1218    struct task_struct *task = ctx->task;
1219
1220    lockdep_assert_held(&ctx->mutex);
1221
1222    if (!task) {
1223        /*
1224         * Per cpu events are removed via an smp call and
1225         * the removal is always successful.
1226         */
1227        cpu_function_call(event->cpu, __perf_remove_from_context, event);
1228        return;
1229    }
1230
1231retry:
1232    if (!task_function_call(task, __perf_remove_from_context, event))
1233        return;
1234
1235    raw_spin_lock_irq(&ctx->lock);
1236    /*
1237     * If we failed to find a running task, but find the context active now
1238     * that we've acquired the ctx->lock, retry.
1239     */
1240    if (ctx->is_active) {
1241        raw_spin_unlock_irq(&ctx->lock);
1242        goto retry;
1243    }
1244
1245    /*
1246     * Since the task isn't running, it's safe to remove the event; our
1247     * holding the ctx->lock ensures the task won't get scheduled in.
1248     */
1249    list_del_event(event, ctx);
1250    raw_spin_unlock_irq(&ctx->lock);
1251}
1252
1253/*
1254 * Cross CPU call to disable a performance event
1255 */
1256static int __perf_event_disable(void *info)
1257{
1258    struct perf_event *event = info;
1259    struct perf_event_context *ctx = event->ctx;
1260    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1261
1262    /*
1263     * If this is a per-task event, need to check whether this
1264     * event's task is the current task on this cpu.
1265     *
1266     * Can trigger due to concurrent perf_event_context_sched_out()
1267     * flipping contexts around.
1268     */
1269    if (ctx->task && cpuctx->task_ctx != ctx)
1270        return -EINVAL;
1271
1272    raw_spin_lock(&ctx->lock);
1273
1274    /*
1275     * If the event is on, turn it off.
1276     * If it is in error state, leave it in error state.
1277     */
1278    if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1279        update_context_time(ctx);
1280        update_cgrp_time_from_event(event);
1281        update_group_times(event);
1282        if (event == event->group_leader)
1283            group_sched_out(event, cpuctx, ctx);
1284        else
1285            event_sched_out(event, cpuctx, ctx);
1286        event->state = PERF_EVENT_STATE_OFF;
1287    }
1288
1289    raw_spin_unlock(&ctx->lock);
1290
1291    return 0;
1292}
1293
1294/*
1295 * Disable an event.
1296 *
1297 * If event->ctx is a cloned context, callers must make sure that
1298 * every task struct that event->ctx->task could possibly point to
1299 * remains valid. This condition is satisfied when called through
1300 * perf_event_for_each_child or perf_event_for_each because they
1301 * hold the top-level event's child_mutex, so any descendant that
1302 * goes to exit will block in sync_child_event.
1303 * When called from perf_pending_event it's OK because event->ctx
1304 * is the current context on this CPU and preemption is disabled,
1305 * hence we can't get into perf_event_task_sched_out for this context.
1306 */
1307void perf_event_disable(struct perf_event *event)
1308{
1309    struct perf_event_context *ctx = event->ctx;
1310    struct task_struct *task = ctx->task;
1311
1312    if (!task) {
1313        /*
1314         * Disable the event on the cpu that it's on
1315         */
1316        cpu_function_call(event->cpu, __perf_event_disable, event);
1317        return;
1318    }
1319
1320retry:
1321    if (!task_function_call(task, __perf_event_disable, event))
1322        return;
1323
1324    raw_spin_lock_irq(&ctx->lock);
1325    /*
1326     * If the event is still active, we need to retry the cross-call.
1327     */
1328    if (event->state == PERF_EVENT_STATE_ACTIVE) {
1329        raw_spin_unlock_irq(&ctx->lock);
1330        /*
1331         * Reload the task pointer, it might have been changed by
1332         * a concurrent perf_event_context_sched_out().
1333         */
1334        task = ctx->task;
1335        goto retry;
1336    }
1337
1338    /*
1339     * Since we have the lock this context can't be scheduled
1340     * in, so we can change the state safely.
1341     */
1342    if (event->state == PERF_EVENT_STATE_INACTIVE) {
1343        update_group_times(event);
1344        event->state = PERF_EVENT_STATE_OFF;
1345    }
1346    raw_spin_unlock_irq(&ctx->lock);
1347}
1348EXPORT_SYMBOL_GPL(perf_event_disable);
1349
1350static void perf_set_shadow_time(struct perf_event *event,
1351                 struct perf_event_context *ctx,
1352                 u64 tstamp)
1353{
1354    /*
1355     * use the correct time source for the time snapshot
1356     *
1357     * We could get by without this by leveraging the
1358     * fact that to get to this function, the caller
1359     * has most likely already called update_context_time()
1360     * and update_cgrp_time_xx() and thus both timestamps
1361     * are identical (or very close). Given that tstamp is
1362     * already adjusted for cgroup, we could say that:
1363     * tstamp - ctx->timestamp
1364     * is equivalent to
1365     * tstamp - cgrp->timestamp.
1366     *
1367     * Then, in perf_output_read(), the calculation would
1368     * work with no changes because:
1369     * - event is guaranteed scheduled in
1370     * - no scheduled out in between
1371     * - thus the timestamp would be the same
1372     *
1373     * But this is a bit hairy.
1374     *
1375     * So instead, we have an explicit cgroup call to remain
1376     * within the same time source all along. We believe it
1377     * is cleaner and simpler to understand.
1378     */
1379    if (is_cgroup_event(event))
1380        perf_cgroup_set_shadow_time(event, tstamp);
1381    else
1382        event->shadow_ctx_time = tstamp - ctx->timestamp;
1383}
1384
1385#define MAX_INTERRUPTS (~0ULL)
1386
1387static void perf_log_throttle(struct perf_event *event, int enable);
1388
1389static int
1390event_sched_in(struct perf_event *event,
1391         struct perf_cpu_context *cpuctx,
1392         struct perf_event_context *ctx)
1393{
1394    u64 tstamp = perf_event_time(event);
1395
1396    if (event->state <= PERF_EVENT_STATE_OFF)
1397        return 0;
1398
1399    event->state = PERF_EVENT_STATE_ACTIVE;
1400    event->oncpu = smp_processor_id();
1401
1402    /*
1403     * Unthrottle events, since we scheduled we might have missed several
1404     * ticks already, also for a heavily scheduling task there is little
1405     * guarantee it'll get a tick in a timely manner.
1406     */
1407    if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1408        perf_log_throttle(event, 1);
1409        event->hw.interrupts = 0;
1410    }
1411
1412    /*
1413     * The new state must be visible before we turn it on in the hardware:
1414     */
1415    smp_wmb();
1416
1417    if (event->pmu->add(event, PERF_EF_START)) {
1418        event->state = PERF_EVENT_STATE_INACTIVE;
1419        event->oncpu = -1;
1420        return -EAGAIN;
1421    }
1422
1423    event->tstamp_running += tstamp - event->tstamp_stopped;
1424
1425    perf_set_shadow_time(event, ctx, tstamp);
1426
1427    if (!is_software_event(event))
1428        cpuctx->active_oncpu++;
1429    ctx->nr_active++;
1430    if (event->attr.freq && event->attr.sample_freq)
1431        ctx->nr_freq++;
1432
1433    if (event->attr.exclusive)
1434        cpuctx->exclusive = 1;
1435
1436    return 0;
1437}
1438
1439static int
1440group_sched_in(struct perf_event *group_event,
1441           struct perf_cpu_context *cpuctx,
1442           struct perf_event_context *ctx)
1443{
1444    struct perf_event *event, *partial_group = NULL;
1445    struct pmu *pmu = group_event->pmu;
1446    u64 now = ctx->time;
1447    bool simulate = false;
1448
1449    if (group_event->state == PERF_EVENT_STATE_OFF)
1450        return 0;
1451
1452    pmu->start_txn(pmu);
1453
1454    if (event_sched_in(group_event, cpuctx, ctx)) {
1455        pmu->cancel_txn(pmu);
1456        return -EAGAIN;
1457    }
1458
1459    /*
1460     * Schedule in siblings as one group (if any):
1461     */
1462    list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1463        if (event_sched_in(event, cpuctx, ctx)) {
1464            partial_group = event;
1465            goto group_error;
1466        }
1467    }
1468
1469    if (!pmu->commit_txn(pmu))
1470        return 0;
1471
1472group_error:
1473    /*
1474     * Groups can be scheduled in as one unit only, so undo any
1475     * partial group before returning:
1476     * The events up to the failed event are scheduled out normally,
1477     * tstamp_stopped will be updated.
1478     *
1479     * The failed events and the remaining siblings need to have
1480     * their timings updated as if they had gone through event_sched_in()
1481     * and event_sched_out(). This is required to get consistent timings
1482     * across the group. This also takes care of the case where the group
1483     * could never be scheduled by ensuring tstamp_stopped is set to mark
1484     * the time the event was actually stopped, such that time delta
1485     * calculation in update_event_times() is correct.
1486     */
1487    list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1488        if (event == partial_group)
1489            simulate = true;
1490
1491        if (simulate) {
1492            event->tstamp_running += now - event->tstamp_stopped;
1493            event->tstamp_stopped = now;
1494        } else {
1495            event_sched_out(event, cpuctx, ctx);
1496        }
1497    }
1498    event_sched_out(group_event, cpuctx, ctx);
1499
1500    pmu->cancel_txn(pmu);
1501
1502    return -EAGAIN;
1503}
1504
1505/*
1506 * Work out whether we can put this event group on the CPU now.
1507 */
1508static int group_can_go_on(struct perf_event *event,
1509               struct perf_cpu_context *cpuctx,
1510               int can_add_hw)
1511{
1512    /*
1513     * Groups consisting entirely of software events can always go on.
1514     */
1515    if (event->group_flags & PERF_GROUP_SOFTWARE)
1516        return 1;
1517    /*
1518     * If an exclusive group is already on, no other hardware
1519     * events can go on.
1520     */
1521    if (cpuctx->exclusive)
1522        return 0;
1523    /*
1524     * If this group is exclusive and there are already
1525     * events on the CPU, it can't go on.
1526     */
1527    if (event->attr.exclusive && cpuctx->active_oncpu)
1528        return 0;
1529    /*
1530     * Otherwise, try to add it if all previous groups were able
1531     * to go on.
1532     */
1533    return can_add_hw;
1534}
1535
1536static void add_event_to_ctx(struct perf_event *event,
1537                   struct perf_event_context *ctx)
1538{
1539    u64 tstamp = perf_event_time(event);
1540
1541    list_add_event(event, ctx);
1542    perf_group_attach(event);
1543    event->tstamp_enabled = tstamp;
1544    event->tstamp_running = tstamp;
1545    event->tstamp_stopped = tstamp;
1546}
1547
1548static void task_ctx_sched_out(struct perf_event_context *ctx);
1549static void
1550ctx_sched_in(struct perf_event_context *ctx,
1551         struct perf_cpu_context *cpuctx,
1552         enum event_type_t event_type,
1553         struct task_struct *task);
1554
1555static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1556                struct perf_event_context *ctx,
1557                struct task_struct *task)
1558{
1559    cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1560    if (ctx)
1561        ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1562    cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1563    if (ctx)
1564        ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1565}
1566
1567/*
1568 * Cross CPU call to install and enable a performance event
1569 *
1570 * Must be called with ctx->mutex held
1571 */
1572static int __perf_install_in_context(void *info)
1573{
1574    struct perf_event *event = info;
1575    struct perf_event_context *ctx = event->ctx;
1576    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1577    struct perf_event_context *task_ctx = cpuctx->task_ctx;
1578    struct task_struct *task = current;
1579
1580    perf_ctx_lock(cpuctx, task_ctx);
1581    perf_pmu_disable(cpuctx->ctx.pmu);
1582
1583    /*
1584     * If there was an active task_ctx schedule it out.
1585     */
1586    if (task_ctx)
1587        task_ctx_sched_out(task_ctx);
1588
1589    /*
1590     * If the context we're installing events in is not the
1591     * active task_ctx, flip them.
1592     */
1593    if (ctx->task && task_ctx != ctx) {
1594        if (task_ctx)
1595            raw_spin_unlock(&task_ctx->lock);
1596        raw_spin_lock(&ctx->lock);
1597        task_ctx = ctx;
1598    }
1599
1600    if (task_ctx) {
1601        cpuctx->task_ctx = task_ctx;
1602        task = task_ctx->task;
1603    }
1604
1605    cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1606
1607    update_context_time(ctx);
1608    /*
1609     * update cgrp time only if current cgrp
1610     * matches event->cgrp. Must be done before
1611     * calling add_event_to_ctx()
1612     */
1613    update_cgrp_time_from_event(event);
1614
1615    add_event_to_ctx(event, ctx);
1616
1617    /*
1618     * Schedule everything back in
1619     */
1620    perf_event_sched_in(cpuctx, task_ctx, task);
1621
1622    perf_pmu_enable(cpuctx->ctx.pmu);
1623    perf_ctx_unlock(cpuctx, task_ctx);
1624
1625    return 0;
1626}
1627
1628/*
1629 * Attach a performance event to a context
1630 *
1631 * First we add the event to the list with the hardware enable bit
1632 * in event->hw_config cleared.
1633 *
1634 * If the event is attached to a task which is on a CPU we use an smp
1635 * call to enable it in the task context. The task might have been
1636 * scheduled away, but we check this in the smp call again.
1637 */
1638static void
1639perf_install_in_context(struct perf_event_context *ctx,
1640            struct perf_event *event,
1641            int cpu)
1642{
1643    struct task_struct *task = ctx->task;
1644
1645    lockdep_assert_held(&ctx->mutex);
1646
1647    event->ctx = ctx;
1648
1649    if (!task) {
1650        /*
1651         * Per cpu events are installed via an smp call and
1652         * the install is always successful.
1653         */
1654        cpu_function_call(cpu, __perf_install_in_context, event);
1655        return;
1656    }
1657
1658retry:
1659    if (!task_function_call(task, __perf_install_in_context, event))
1660        return;
1661
1662    raw_spin_lock_irq(&ctx->lock);
1663    /*
1664     * If we failed to find a running task, but find the context active now
1665     * that we've acquired the ctx->lock, retry.
1666     */
1667    if (ctx->is_active) {
1668        raw_spin_unlock_irq(&ctx->lock);
1669        goto retry;
1670    }
1671
1672    /*
1673     * Since the task isn't running, it's safe to add the event; our holding
1674     * the ctx->lock ensures the task won't get scheduled in.
1675     */
1676    add_event_to_ctx(event, ctx);
1677    raw_spin_unlock_irq(&ctx->lock);
1678}
1679
1680/*
1681 * Put an event into inactive state and update time fields.
1682 * Enabling the leader of a group effectively enables all
1683 * the group members that aren't explicitly disabled, so we
1684 * have to update their ->tstamp_enabled also.
1685 * Note: this works for group members as well as group leaders
1686 * since the non-leader members' sibling_lists will be empty.
1687 */
1688static void __perf_event_mark_enabled(struct perf_event *event)
1689{
1690    struct perf_event *sub;
1691    u64 tstamp = perf_event_time(event);
1692
1693    event->state = PERF_EVENT_STATE_INACTIVE;
1694    event->tstamp_enabled = tstamp - event->total_time_enabled;
1695    list_for_each_entry(sub, &event->sibling_list, group_entry) {
1696        if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1697            sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1698    }
1699}
1700
1701/*
1702 * Cross CPU call to enable a performance event
1703 */
1704static int __perf_event_enable(void *info)
1705{
1706    struct perf_event *event = info;
1707    struct perf_event_context *ctx = event->ctx;
1708    struct perf_event *leader = event->group_leader;
1709    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1710    int err;
1711
1712    if (WARN_ON_ONCE(!ctx->is_active))
1713        return -EINVAL;
1714
1715    raw_spin_lock(&ctx->lock);
1716    update_context_time(ctx);
1717
1718    if (event->state >= PERF_EVENT_STATE_INACTIVE)
1719        goto unlock;
1720
1721    /*
1722     * set current task's cgroup time reference point
1723     */
1724    perf_cgroup_set_timestamp(current, ctx);
1725
1726    __perf_event_mark_enabled(event);
1727
1728    if (!event_filter_match(event)) {
1729        if (is_cgroup_event(event))
1730            perf_cgroup_defer_enabled(event);
1731        goto unlock;
1732    }
1733
1734    /*
1735     * If the event is in a group and isn't the group leader,
1736     * then don't put it on unless the group is on.
1737     */
1738    if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1739        goto unlock;
1740
1741    if (!group_can_go_on(event, cpuctx, 1)) {
1742        err = -EEXIST;
1743    } else {
1744        if (event == leader)
1745            err = group_sched_in(event, cpuctx, ctx);
1746        else
1747            err = event_sched_in(event, cpuctx, ctx);
1748    }
1749
1750    if (err) {
1751        /*
1752         * If this event can't go on and it's part of a
1753         * group, then the whole group has to come off.
1754         */
1755        if (leader != event)
1756            group_sched_out(leader, cpuctx, ctx);
1757        if (leader->attr.pinned) {
1758            update_group_times(leader);
1759            leader->state = PERF_EVENT_STATE_ERROR;
1760        }
1761    }
1762
1763unlock:
1764    raw_spin_unlock(&ctx->lock);
1765
1766    return 0;
1767}
1768
1769/*
1770 * Enable an event.
1771 *
1772 * If event->ctx is a cloned context, callers must make sure that
1773 * every task struct that event->ctx->task could possibly point to
1774 * remains valid. This condition is satisfied when called through
1775 * perf_event_for_each_child or perf_event_for_each as described
1776 * for perf_event_disable.
1777 */
1778void perf_event_enable(struct perf_event *event)
1779{
1780    struct perf_event_context *ctx = event->ctx;
1781    struct task_struct *task = ctx->task;
1782
1783    if (!task) {
1784        /*
1785         * Enable the event on the cpu that it's on
1786         */
1787        cpu_function_call(event->cpu, __perf_event_enable, event);
1788        return;
1789    }
1790
1791    raw_spin_lock_irq(&ctx->lock);
1792    if (event->state >= PERF_EVENT_STATE_INACTIVE)
1793        goto out;
1794
1795    /*
1796     * If the event is in error state, clear that first.
1797     * That way, if we see the event in error state below, we
1798     * know that it has gone back into error state, as distinct
1799     * from the task having been scheduled away before the
1800     * cross-call arrived.
1801     */
1802    if (event->state == PERF_EVENT_STATE_ERROR)
1803        event->state = PERF_EVENT_STATE_OFF;
1804
1805retry:
1806    if (!ctx->is_active) {
1807        __perf_event_mark_enabled(event);
1808        goto out;
1809    }
1810
1811    raw_spin_unlock_irq(&ctx->lock);
1812
1813    if (!task_function_call(task, __perf_event_enable, event))
1814        return;
1815
1816    raw_spin_lock_irq(&ctx->lock);
1817
1818    /*
1819     * If the context is active and the event is still off,
1820     * we need to retry the cross-call.
1821     */
1822    if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1823        /*
1824         * task could have been flipped by a concurrent
1825         * perf_event_context_sched_out()
1826         */
1827        task = ctx->task;
1828        goto retry;
1829    }
1830
1831out:
1832    raw_spin_unlock_irq(&ctx->lock);
1833}
1834EXPORT_SYMBOL_GPL(perf_event_enable);
1835
1836int perf_event_refresh(struct perf_event *event, int refresh)
1837{
1838    /*
1839     * not supported on inherited events
1840     */
1841    if (event->attr.inherit || !is_sampling_event(event))
1842        return -EINVAL;
1843
1844    atomic_add(refresh, &event->event_limit);
1845    perf_event_enable(event);
1846
1847    return 0;
1848}
1849EXPORT_SYMBOL_GPL(perf_event_refresh);
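/*
 * Illustrative sketch (not part of the original file): an in-kernel user
 * that owns a perf_event (for example one created with
 * perf_event_create_kernel_counter()) can pause and resume it with the
 * exported helpers above. The function name is hypothetical.
 */
#if 0
static void example_pause_and_resume(struct perf_event *event)
{
    perf_event_disable(event);
    /* ... window during which the event does not count or sample ... */
    perf_event_enable(event);
}
#endif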
1850
1851static void ctx_sched_out(struct perf_event_context *ctx,
1852              struct perf_cpu_context *cpuctx,
1853              enum event_type_t event_type)
1854{
1855    struct perf_event *event;
1856    int is_active = ctx->is_active;
1857
1858    ctx->is_active &= ~event_type;
1859    if (likely(!ctx->nr_events))
1860        return;
1861
1862    update_context_time(ctx);
1863    update_cgrp_time_from_cpuctx(cpuctx);
1864    if (!ctx->nr_active)
1865        return;
1866
1867    perf_pmu_disable(ctx->pmu);
1868    if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
1869        list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1870            group_sched_out(event, cpuctx, ctx);
1871    }
1872
1873    if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
1874        list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1875            group_sched_out(event, cpuctx, ctx);
1876    }
1877    perf_pmu_enable(ctx->pmu);
1878}
1879
1880/*
1881 * Test whether two contexts are equivalent, i.e. whether they
1882 * have both been cloned from the same version of the same context
1883 * and they both have the same number of enabled events.
1884 * If the number of enabled events is the same, then the set
1885 * of enabled events should be the same, because these are both
1886 * inherited contexts, therefore we can't access individual events
1887 * in them directly with an fd; we can only enable/disable all
1888 * events via prctl, or enable/disable all events in a family
1889 * via ioctl, which will have the same effect on both contexts.
1890 */
1891static int context_equiv(struct perf_event_context *ctx1,
1892             struct perf_event_context *ctx2)
1893{
1894    return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1895        && ctx1->parent_gen == ctx2->parent_gen
1896        && !ctx1->pin_count && !ctx2->pin_count;
1897}
1898
1899static void __perf_event_sync_stat(struct perf_event *event,
1900                     struct perf_event *next_event)
1901{
1902    u64 value;
1903
1904    if (!event->attr.inherit_stat)
1905        return;
1906
1907    /*
1908     * Update the event value, we cannot use perf_event_read()
1909     * because we're in the middle of a context switch and have IRQs
1910     * disabled, which upsets smp_call_function_single(), however
1911     * we know the event must be on the current CPU, therefore we
1912     * don't need to use it.
1913     */
1914    switch (event->state) {
1915    case PERF_EVENT_STATE_ACTIVE:
1916        event->pmu->read(event);
1917        /* fall-through */
1918
1919    case PERF_EVENT_STATE_INACTIVE:
1920        update_event_times(event);
1921        break;
1922
1923    default:
1924        break;
1925    }
1926
1927    /*
1928     * In order to keep per-task stats reliable we need to flip the event
1929     * values when we flip the contexts.
1930     */
1931    value = local64_read(&next_event->count);
1932    value = local64_xchg(&event->count, value);
1933    local64_set(&next_event->count, value);
1934
1935    swap(event->total_time_enabled, next_event->total_time_enabled);
1936    swap(event->total_time_running, next_event->total_time_running);
1937
1938    /*
1939     * Since we swizzled the values, update the user visible data too.
1940     */
1941    perf_event_update_userpage(event);
1942    perf_event_update_userpage(next_event);
1943}
1944
1945#define list_next_entry(pos, member) \
1946    list_entry(pos->member.next, typeof(*pos), member)
1947
1948static void perf_event_sync_stat(struct perf_event_context *ctx,
1949                   struct perf_event_context *next_ctx)
1950{
1951    struct perf_event *event, *next_event;
1952
1953    if (!ctx->nr_stat)
1954        return;
1955
1956    update_context_time(ctx);
1957
1958    event = list_first_entry(&ctx->event_list,
1959                   struct perf_event, event_entry);
1960
1961    next_event = list_first_entry(&next_ctx->event_list,
1962                    struct perf_event, event_entry);
1963
1964    while (&event->event_entry != &ctx->event_list &&
1965           &next_event->event_entry != &next_ctx->event_list) {
1966
1967        __perf_event_sync_stat(event, next_event);
1968
1969        event = list_next_entry(event, event_entry);
1970        next_event = list_next_entry(next_event, event_entry);
1971    }
1972}
1973
1974static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1975                     struct task_struct *next)
1976{
1977    struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1978    struct perf_event_context *next_ctx;
1979    struct perf_event_context *parent;
1980    struct perf_cpu_context *cpuctx;
1981    int do_switch = 1;
1982
1983    if (likely(!ctx))
1984        return;
1985
1986    cpuctx = __get_cpu_context(ctx);
1987    if (!cpuctx->task_ctx)
1988        return;
1989
1990    rcu_read_lock();
1991    parent = rcu_dereference(ctx->parent_ctx);
1992    next_ctx = next->perf_event_ctxp[ctxn];
1993    if (parent && next_ctx &&
1994        rcu_dereference(next_ctx->parent_ctx) == parent) {
1995        /*
1996         * Looks like the two contexts are clones, so we might be
1997         * able to optimize the context switch. We lock both
1998         * contexts and check that they are clones under the
1999         * lock (including re-checking that neither has been
2000         * uncloned in the meantime). It doesn't matter which
2001         * order we take the locks because no other cpu could
2002         * be trying to lock both of these tasks.
2003         */
2004        raw_spin_lock(&ctx->lock);
2005        raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2006        if (context_equiv(ctx, next_ctx)) {
2007            /*
2008             * XXX do we need a memory barrier of sorts
2009             * wrt rcu_dereference() of perf_event_ctxp
2010             */
2011            task->perf_event_ctxp[ctxn] = next_ctx;
2012            next->perf_event_ctxp[ctxn] = ctx;
2013            ctx->task = next;
2014            next_ctx->task = task;
2015            do_switch = 0;
2016
2017            perf_event_sync_stat(ctx, next_ctx);
2018        }
2019        raw_spin_unlock(&next_ctx->lock);
2020        raw_spin_unlock(&ctx->lock);
2021    }
2022    rcu_read_unlock();
2023
2024    if (do_switch) {
2025        raw_spin_lock(&ctx->lock);
2026        ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2027        cpuctx->task_ctx = NULL;
2028        raw_spin_unlock(&ctx->lock);
2029    }
2030}
2031
2032#define for_each_task_context_nr(ctxn) \
2033    for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2034
2035/*
2036 * Called from scheduler to remove the events of the current task,
2037 * with interrupts disabled.
2038 *
2039 * We stop each event and update the event value in event->count.
2040 *
2041 * This does not protect us against NMI, but disable()
2042 * sets the disabled bit in the control field of event _before_
2043 * accessing the event control register. If an NMI hits, then it will
2044 * not restart the event.
2045 */
2046void __perf_event_task_sched_out(struct task_struct *task,
2047                 struct task_struct *next)
2048{
2049    int ctxn;
2050
2051    for_each_task_context_nr(ctxn)
2052        perf_event_context_sched_out(task, ctxn, next);
2053
2054    /*
2055     * if cgroup events exist on this CPU, then we need
2056     * to check if we have to switch out PMU state.
2057     * cgroup events are system-wide only
2058     */
2059    if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2060        perf_cgroup_sched_out(task, next);
2061}
2062
2063static void task_ctx_sched_out(struct perf_event_context *ctx)
2064{
2065    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2066
2067    if (!cpuctx->task_ctx)
2068        return;
2069
2070    if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2071        return;
2072
2073    ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2074    cpuctx->task_ctx = NULL;
2075}
2076
2077/*
2078 * Called with IRQs disabled
2079 */
2080static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2081                  enum event_type_t event_type)
2082{
2083    ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2084}
2085
2086static void
2087ctx_pinned_sched_in(struct perf_event_context *ctx,
2088            struct perf_cpu_context *cpuctx)
2089{
2090    struct perf_event *event;
2091
2092    list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2093        if (event->state <= PERF_EVENT_STATE_OFF)
2094            continue;
2095        if (!event_filter_match(event))
2096            continue;
2097
2098        /* may need to reset tstamp_enabled */
2099        if (is_cgroup_event(event))
2100            perf_cgroup_mark_enabled(event, ctx);
2101
2102        if (group_can_go_on(event, cpuctx, 1))
2103            group_sched_in(event, cpuctx, ctx);
2104
2105        /*
2106         * If this pinned group hasn't been scheduled,
2107         * put it in error state.
2108         */
2109        if (event->state == PERF_EVENT_STATE_INACTIVE) {
2110            update_group_times(event);
2111            event->state = PERF_EVENT_STATE_ERROR;
2112        }
2113    }
2114}
2115
2116static void
2117ctx_flexible_sched_in(struct perf_event_context *ctx,
2118              struct perf_cpu_context *cpuctx)
2119{
2120    struct perf_event *event;
2121    int can_add_hw = 1;
2122
2123    list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2124        /* Ignore events in OFF or ERROR state */
2125        if (event->state <= PERF_EVENT_STATE_OFF)
2126            continue;
2127        /*
2128         * Listen to the 'cpu' scheduling filter constraint
2129         * of events:
2130         */
2131        if (!event_filter_match(event))
2132            continue;
2133
2134        /* may need to reset tstamp_enabled */
2135        if (is_cgroup_event(event))
2136            perf_cgroup_mark_enabled(event, ctx);
2137
2138        if (group_can_go_on(event, cpuctx, can_add_hw)) {
2139            if (group_sched_in(event, cpuctx, ctx))
2140                can_add_hw = 0;
2141        }
2142    }
2143}
2144
2145static void
2146ctx_sched_in(struct perf_event_context *ctx,
2147         struct perf_cpu_context *cpuctx,
2148         enum event_type_t event_type,
2149         struct task_struct *task)
2150{
2151    u64 now;
2152    int is_active = ctx->is_active;
2153
2154    ctx->is_active |= event_type;
2155    if (likely(!ctx->nr_events))
2156        return;
2157
2158    now = perf_clock();
2159    ctx->timestamp = now;
2160    perf_cgroup_set_timestamp(task, ctx);
2161    /*
2162     * First go through the list and put on any pinned groups
2163     * in order to give them the best chance of going on.
2164     */
2165    if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2166        ctx_pinned_sched_in(ctx, cpuctx);
2167
2168    /* Then walk through the lower prio flexible groups */
2169    if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2170        ctx_flexible_sched_in(ctx, cpuctx);
2171}
2172
2173static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2174                 enum event_type_t event_type,
2175                 struct task_struct *task)
2176{
2177    struct perf_event_context *ctx = &cpuctx->ctx;
2178
2179    ctx_sched_in(ctx, cpuctx, event_type, task);
2180}
2181
2182static void perf_event_context_sched_in(struct perf_event_context *ctx,
2183                    struct task_struct *task)
2184{
2185    struct perf_cpu_context *cpuctx;
2186
2187    cpuctx = __get_cpu_context(ctx);
2188    if (cpuctx->task_ctx == ctx)
2189        return;
2190
2191    perf_ctx_lock(cpuctx, ctx);
2192    perf_pmu_disable(ctx->pmu);
2193    /*
2194     * We want to keep the following priority order:
2195     * cpu pinned (that don't need to move), task pinned,
2196     * cpu flexible, task flexible.
2197     */
2198    cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2199
2200    if (ctx->nr_events)
2201        cpuctx->task_ctx = ctx;
2202
2203    perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2204
2205    perf_pmu_enable(ctx->pmu);
2206    perf_ctx_unlock(cpuctx, ctx);
2207
2208    /*
2209     * Since these rotations are per-cpu, we need to ensure the
2210     * cpu-context we got scheduled on is actually rotating.
2211     */
2212    perf_pmu_rotate_start(ctx->pmu);
2213}
2214
2215/*
2216 * When sampling the branch stack in system-wide mode, it may be necessary
2217 * to flush the stack on context switch. This happens when the branch
2218 * stack does not tag its entries with the pid of the current task.
2219 * Otherwise it becomes impossible to associate a branch entry with a
2220 * task. This ambiguity is more likely to appear when the branch stack
2221 * supports priv level filtering and the user sets it to monitor only
2222 * at the user level (which could be a useful measurement in system-wide
2223 * mode). In that case, the risk is high of having a branch stack with
2224 * branches from multiple tasks. Flushing may mean dropping the existing
2225 * entries or stashing them somewhere in the PMU specific code layer.
2226 *
2227 * This function provides the context switch callback to the lower code
2228 * layer. It is invoked ONLY when there is at least one system-wide context
2229 * with at least one active event using taken branch sampling.
2230 */
2231static void perf_branch_stack_sched_in(struct task_struct *prev,
2232                       struct task_struct *task)
2233{
2234    struct perf_cpu_context *cpuctx;
2235    struct pmu *pmu;
2236    unsigned long flags;
2237
2238    /* no need to flush branch stack if not changing task */
2239    if (prev == task)
2240        return;
2241
2242    local_irq_save(flags);
2243
2244    rcu_read_lock();
2245
2246    list_for_each_entry_rcu(pmu, &pmus, entry) {
2247        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2248
2249        /*
2250         * check if the context has at least one
2251         * event using PERF_SAMPLE_BRANCH_STACK
2252         */
2253        if (cpuctx->ctx.nr_branch_stack > 0
2254            && pmu->flush_branch_stack) {
2255
2256            pmu = cpuctx->ctx.pmu;
2257
2258            perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2259
2260            perf_pmu_disable(pmu);
2261
2262            pmu->flush_branch_stack();
2263
2264            perf_pmu_enable(pmu);
2265
2266            perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2267        }
2268    }
2269
2270    rcu_read_unlock();
2271
2272    local_irq_restore(flags);
2273}
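
/*
 * Illustrative sketch, not part of this file: the scenario described above
 * is typically created by a system-wide event sampling user-level branches
 * only, e.g. (attribute values assumed):
 *
 *    struct perf_event_attr attr = {
 *        .type               = PERF_TYPE_HARDWARE,
 *        .config             = PERF_COUNT_HW_CPU_CYCLES,
 *        .sample_period      = 100000,
 *        .sample_type        = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
 *        .branch_sample_type = PERF_SAMPLE_BRANCH_USER |
 *                              PERF_SAMPLE_BRANCH_ANY,
 *    };
 *    // opened with pid == -1 and cpu >= 0, i.e. system-wide on one CPU
 *
 * With no pid tags in the branch buffer, the PMU has to flush it on every
 * task switch, which is what perf_branch_stack_sched_in() arranges via
 * pmu->flush_branch_stack().
 */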
2274
2275/*
2276 * Called from scheduler to add the events of the current task
2277 * with interrupts disabled.
2278 *
2279 * We restore the event value and then enable it.
2280 *
2281 * This does not protect us against NMI, but enable()
2282 * sets the enabled bit in the control field of event _before_
2283 * accessing the event control register. If a NMI hits, then it will
2284 * keep the event running.
2285 */
2286void __perf_event_task_sched_in(struct task_struct *prev,
2287                struct task_struct *task)
2288{
2289    struct perf_event_context *ctx;
2290    int ctxn;
2291
2292    for_each_task_context_nr(ctxn) {
2293        ctx = task->perf_event_ctxp[ctxn];
2294        if (likely(!ctx))
2295            continue;
2296
2297        perf_event_context_sched_in(ctx, task);
2298    }
2299    /*
2300     * if cgroup events exist on this CPU, then we need
2301     * to check if we have to switch in PMU state.
2302     * cgroup events are system-wide only
2303     */
2304    if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2305        perf_cgroup_sched_in(prev, task);
2306
2307    /* check for system-wide branch_stack events */
2308    if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2309        perf_branch_stack_sched_in(prev, task);
2310}
2311
2312static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2313{
2314    u64 frequency = event->attr.sample_freq;
2315    u64 sec = NSEC_PER_SEC;
2316    u64 divisor, dividend;
2317
2318    int count_fls, nsec_fls, frequency_fls, sec_fls;
2319
2320    count_fls = fls64(count);
2321    nsec_fls = fls64(nsec);
2322    frequency_fls = fls64(frequency);
2323    sec_fls = 30;
2324
2325    /*
2326     * We got @count in @nsec, with a target of sample_freq HZ
2327     * the target period becomes:
2328     *
2329     *                @count * 10^9
2330     * period = -------------------
2331     *          @nsec * sample_freq
2332     *
2333     */
2334
2335    /*
2336     * Reduce accuracy by one bit such that @a and @b converge
2337     * to a similar magnitude.
2338     */
2339#define REDUCE_FLS(a, b) \
2340do { \
2341    if (a##_fls > b##_fls) { \
2342        a >>= 1; \
2343        a##_fls--; \
2344    } else { \
2345        b >>= 1; \
2346        b##_fls--; \
2347    } \
2348} while (0)
2349
2350    /*
2351     * Reduce accuracy until either term fits in a u64, then proceed with
2352     * the other, so that finally we can do a u64/u64 division.
2353     */
2354    while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2355        REDUCE_FLS(nsec, frequency);
2356        REDUCE_FLS(sec, count);
2357    }
2358
2359    if (count_fls + sec_fls > 64) {
2360        divisor = nsec * frequency;
2361
2362        while (count_fls + sec_fls > 64) {
2363            REDUCE_FLS(count, sec);
2364            divisor >>= 1;
2365        }
2366
2367        dividend = count * sec;
2368    } else {
2369        dividend = count * sec;
2370
2371        while (nsec_fls + frequency_fls > 64) {
2372            REDUCE_FLS(nsec, frequency);
2373            dividend >>= 1;
2374        }
2375
2376        divisor = nsec * frequency;
2377    }
2378
2379    if (!divisor)
2380        return dividend;
2381
2382    return div64_u64(dividend, divisor);
2383}
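
/*
 * Worked example (numbers assumed, for illustration only): with
 * sample_freq = 4000 Hz and an event that counted count = 10,000 events
 * over the last nsec = 1,000,000 ns tick, the formula above gives
 *
 *    period = 10,000 * 10^9 / (1,000,000 * 4000) = 2500
 *
 * i.e. one sample every 2500 events, which at 10,000 events/ms is the
 * requested 4000 samples per second.  The REDUCE_FLS() steps only drop
 * low-order bits so that the product and quotient fit in 64 bits.
 */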
2384
2385static DEFINE_PER_CPU(int, perf_throttled_count);
2386static DEFINE_PER_CPU(u64, perf_throttled_seq);
2387
2388static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2389{
2390    struct hw_perf_event *hwc = &event->hw;
2391    s64 period, sample_period;
2392    s64 delta;
2393
2394    period = perf_calculate_period(event, nsec, count);
2395
2396    delta = (s64)(period - hwc->sample_period);
2397    delta = (delta + 7) / 8; /* low pass filter */
2398
2399    sample_period = hwc->sample_period + delta;
2400
2401    if (!sample_period)
2402        sample_period = 1;
2403
2404    hwc->sample_period = sample_period;
2405
2406    if (local64_read(&hwc->period_left) > 8*sample_period) {
2407        if (disable)
2408            event->pmu->stop(event, PERF_EF_UPDATE);
2409
2410        local64_set(&hwc->period_left, 0);
2411
2412        if (disable)
2413            event->pmu->start(event, PERF_EF_RELOAD);
2414    }
2415}
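
/*
 * Illustrative arithmetic (values assumed): if hwc->sample_period is 2000
 * and perf_calculate_period() now suggests 2800, then
 *
 *    delta = (2800 - 2000 + 7) / 8 = 100
 *
 * and the period only moves to 2100 on this adjustment.  The divide-by-8
 * low pass filter above lets the period converge on the target gradually
 * instead of following every tick's measurement noise.
 */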
2416
2417/*
2418 * combine freq adjustment with unthrottling to avoid two passes over the
2419 * events. At the same time, make sure that having freq events does not change
2420 * the rate of unthrottling as that would introduce bias.
2421 */
2422static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2423                       int needs_unthr)
2424{
2425    struct perf_event *event;
2426    struct hw_perf_event *hwc;
2427    u64 now, period = TICK_NSEC;
2428    s64 delta;
2429
2430    /*
2431     * We only need to iterate over all events if:
2432     * - the context has events in frequency mode (needs freq adjust), or
2433     * - there are events to unthrottle on this cpu
2434     */
2435    if (!(ctx->nr_freq || needs_unthr))
2436        return;
2437
2438    raw_spin_lock(&ctx->lock);
2439    perf_pmu_disable(ctx->pmu);
2440
2441    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2442        if (event->state != PERF_EVENT_STATE_ACTIVE)
2443            continue;
2444
2445        if (!event_filter_match(event))
2446            continue;
2447
2448        hwc = &event->hw;
2449
2450        if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2451            hwc->interrupts = 0;
2452            perf_log_throttle(event, 1);
2453            event->pmu->start(event, 0);
2454        }
2455
2456        if (!event->attr.freq || !event->attr.sample_freq)
2457            continue;
2458
2459        /*
2460         * stop the event and update event->count
2461         */
2462        event->pmu->stop(event, PERF_EF_UPDATE);
2463
2464        now = local64_read(&event->count);
2465        delta = now - hwc->freq_count_stamp;
2466        hwc->freq_count_stamp = now;
2467
2468        /*
2469         * Restart the event, reloading only if the value
2470         * has changed.
2471         * We have already stopped the event, so tell
2472         * perf_adjust_period() to avoid stopping it
2473         * twice.
2474         */
2475        if (delta > 0)
2476            perf_adjust_period(event, period, delta, false);
2477
2478        event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2479    }
2480
2481    perf_pmu_enable(ctx->pmu);
2482    raw_spin_unlock(&ctx->lock);
2483}
2484
2485/*
2486 * Round-robin a context's events:
2487 */
2488static void rotate_ctx(struct perf_event_context *ctx)
2489{
2490    /*
2491     * Rotate the first entry of the non-pinned groups to the end. Rotation might be
2492     * disabled by the inheritance code.
2493     */
2494    if (!ctx->rotate_disable)
2495        list_rotate_left(&ctx->flexible_groups);
2496}
2497
2498/*
2499 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2500 * because they're strictly cpu affine and rotate_start is called with IRQs
2501 * disabled, while rotate_context is called from IRQ context.
2502 */
2503static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2504{
2505    struct perf_event_context *ctx = NULL;
2506    int rotate = 0, remove = 1;
2507
2508    if (cpuctx->ctx.nr_events) {
2509        remove = 0;
2510        if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2511            rotate = 1;
2512    }
2513
2514    ctx = cpuctx->task_ctx;
2515    if (ctx && ctx->nr_events) {
2516        remove = 0;
2517        if (ctx->nr_events != ctx->nr_active)
2518            rotate = 1;
2519    }
2520
2521    if (!rotate)
2522        goto done;
2523
2524    perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2525    perf_pmu_disable(cpuctx->ctx.pmu);
2526
2527    cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2528    if (ctx)
2529        ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2530
2531    rotate_ctx(&cpuctx->ctx);
2532    if (ctx)
2533        rotate_ctx(ctx);
2534
2535    perf_event_sched_in(cpuctx, ctx, current);
2536
2537    perf_pmu_enable(cpuctx->ctx.pmu);
2538    perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2539done:
2540    if (remove)
2541        list_del_init(&cpuctx->rotation_list);
2542}
2543
2544void perf_event_task_tick(void)
2545{
2546    struct list_head *head = &__get_cpu_var(rotation_list);
2547    struct perf_cpu_context *cpuctx, *tmp;
2548    struct perf_event_context *ctx;
2549    int throttled;
2550
2551    WARN_ON(!irqs_disabled());
2552
2553    __this_cpu_inc(perf_throttled_seq);
2554    throttled = __this_cpu_xchg(perf_throttled_count, 0);
2555
2556    list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2557        ctx = &cpuctx->ctx;
2558        perf_adjust_freq_unthr_context(ctx, throttled);
2559
2560        ctx = cpuctx->task_ctx;
2561        if (ctx)
2562            perf_adjust_freq_unthr_context(ctx, throttled);
2563
2564        if (cpuctx->jiffies_interval == 1 ||
2565                !(jiffies % cpuctx->jiffies_interval))
2566            perf_rotate_context(cpuctx);
2567    }
2568}
2569
2570static int event_enable_on_exec(struct perf_event *event,
2571                struct perf_event_context *ctx)
2572{
2573    if (!event->attr.enable_on_exec)
2574        return 0;
2575
2576    event->attr.enable_on_exec = 0;
2577    if (event->state >= PERF_EVENT_STATE_INACTIVE)
2578        return 0;
2579
2580    __perf_event_mark_enabled(event);
2581
2582    return 1;
2583}
2584
2585/*
2586 * Enable all of a task's events that have been marked enable-on-exec.
2587 * This expects task == current.
2588 */
2589static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2590{
2591    struct perf_event *event;
2592    unsigned long flags;
2593    int enabled = 0;
2594    int ret;
2595
2596    local_irq_save(flags);
2597    if (!ctx || !ctx->nr_events)
2598        goto out;
2599
2600    /*
2601     * We must context-switch out cgroup events to avoid a
2602     * conflict when invoking perf_event_context_sched_in()
2603     * later in this function. Otherwise we end up trying to
2604     * context-switch in cgroup events that are already
2605     * scheduled in.
2606     */
2607    perf_cgroup_sched_out(current, NULL);
2608
2609    raw_spin_lock(&ctx->lock);
2610    task_ctx_sched_out(ctx);
2611
2612    list_for_each_entry(event, &ctx->event_list, event_entry) {
2613        ret = event_enable_on_exec(event, ctx);
2614        if (ret)
2615            enabled = 1;
2616    }
2617
2618    /*
2619     * Unclone this context if we enabled any event.
2620     */
2621    if (enabled)
2622        unclone_ctx(ctx);
2623
2624    raw_spin_unlock(&ctx->lock);
2625
2626    /*
2627     * This also context-switches in cgroup events, if any:
2628     */
2629    perf_event_context_sched_in(ctx, ctx->task);
2630out:
2631    local_irq_restore(flags);
2632}
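
/*
 * Illustrative sketch, not part of this file: enable_on_exec is meant for
 * tools that open events on a forked child and want counting to start
 * exactly at exec() time, e.g. (values assumed):
 *
 *    struct perf_event_attr attr = {
 *        .type           = PERF_TYPE_HARDWARE,
 *        .config         = PERF_COUNT_HW_INSTRUCTIONS,
 *        .disabled       = 1,
 *        .enable_on_exec = 1,
 *    };
 *    // perf_event_open(&attr, child_pid, -1, -1, 0); the child then
 *    // calls exec(), and perf_event_enable_on_exec() turns the event on.
 */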
2633
2634/*
2635 * Cross CPU call to read the hardware event
2636 */
2637static void __perf_event_read(void *info)
2638{
2639    struct perf_event *event = info;
2640    struct perf_event_context *ctx = event->ctx;
2641    struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2642
2643    /*
2644     * If this is a task context, we need to check whether it is
2645     * the current task context of this cpu. If not it has been
2646     * scheduled out before the smp call arrived. In that case
2647     * event->count would have been updated to a recent sample
2648     * when the event was scheduled out.
2649     */
2650    if (ctx->task && cpuctx->task_ctx != ctx)
2651        return;
2652
2653    raw_spin_lock(&ctx->lock);
2654    if (ctx->is_active) {
2655        update_context_time(ctx);
2656        update_cgrp_time_from_event(event);
2657    }
2658    update_event_times(event);
2659    if (event->state == PERF_EVENT_STATE_ACTIVE)
2660        event->pmu->read(event);
2661    raw_spin_unlock(&ctx->lock);
2662}
2663
2664static inline u64 perf_event_count(struct perf_event *event)
2665{
2666    return local64_read(&event->count) + atomic64_read(&event->child_count);
2667}
2668
2669static u64 perf_event_read(struct perf_event *event)
2670{
2671    /*
2672     * If event is enabled and currently active on a CPU, update the
2673     * value in the event structure:
2674     */
2675    if (event->state == PERF_EVENT_STATE_ACTIVE) {
2676        smp_call_function_single(event->oncpu,
2677                     __perf_event_read, event, 1);
2678    } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2679        struct perf_event_context *ctx = event->ctx;
2680        unsigned long flags;
2681
2682        raw_spin_lock_irqsave(&ctx->lock, flags);
2683        /*
2684         * We may read while the context is not active
2685         * (e.g., the thread is blocked); in that case
2686         * we cannot update the context time.
2687         */
2688        if (ctx->is_active) {
2689            update_context_time(ctx);
2690            update_cgrp_time_from_event(event);
2691        }
2692        update_event_times(event);
2693        raw_spin_unlock_irqrestore(&ctx->lock, flags);
2694    }
2695
2696    return perf_event_count(event);
2697}
2698
2699/*
2700 * Initialize the perf_event context in a task_struct:
2701 */
2702static void __perf_event_init_context(struct perf_event_context *ctx)
2703{
2704    raw_spin_lock_init(&ctx->lock);
2705    mutex_init(&ctx->mutex);
2706    INIT_LIST_HEAD(&ctx->pinned_groups);
2707    INIT_LIST_HEAD(&ctx->flexible_groups);
2708    INIT_LIST_HEAD(&ctx->event_list);
2709    atomic_set(&ctx->refcount, 1);
2710}
2711
2712static struct perf_event_context *
2713alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2714{
2715    struct perf_event_context *ctx;
2716
2717    ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2718    if (!ctx)
2719        return NULL;
2720
2721    __perf_event_init_context(ctx);
2722    if (task) {
2723        ctx->task = task;
2724        get_task_struct(task);
2725    }
2726    ctx->pmu = pmu;
2727
2728    return ctx;
2729}
2730
2731static struct task_struct *
2732find_lively_task_by_vpid(pid_t vpid)
2733{
2734    struct task_struct *task;
2735    int err;
2736
2737    rcu_read_lock();
2738    if (!vpid)
2739        task = current;
2740    else
2741        task = find_task_by_vpid(vpid);
2742    if (task)
2743        get_task_struct(task);
2744    rcu_read_unlock();
2745
2746    if (!task)
2747        return ERR_PTR(-ESRCH);
2748
2749    /* Reuse ptrace permission checks for now. */
2750    err = -EACCES;
2751    if (!ptrace_may_access(task, PTRACE_MODE_READ))
2752        goto errout;
2753
2754    return task;
2755errout:
2756    put_task_struct(task);
2757    return ERR_PTR(err);
2758
2759}
2760
2761/*
2762 * Returns a matching context with refcount and pincount.
2763 */
2764static struct perf_event_context *
2765find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2766{
2767    struct perf_event_context *ctx;
2768    struct perf_cpu_context *cpuctx;
2769    unsigned long flags;
2770    int ctxn, err;
2771
2772    if (!task) {
2773        /* Must be root to operate on a CPU event: */
2774        if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2775            return ERR_PTR(-EACCES);
2776
2777        /*
2778         * We could be clever and allow attaching an event to an
2779         * offline CPU and activate it when the CPU comes up, but
2780         * that's for later.
2781         */
2782        if (!cpu_online(cpu))
2783            return ERR_PTR(-ENODEV);
2784
2785        cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2786        ctx = &cpuctx->ctx;
2787        get_ctx(ctx);
2788        ++ctx->pin_count;
2789
2790        return ctx;
2791    }
2792
2793    err = -EINVAL;
2794    ctxn = pmu->task_ctx_nr;
2795    if (ctxn < 0)
2796        goto errout;
2797
2798retry:
2799    ctx = perf_lock_task_context(task, ctxn, &flags);
2800    if (ctx) {
2801        unclone_ctx(ctx);
2802        ++ctx->pin_count;
2803        raw_spin_unlock_irqrestore(&ctx->lock, flags);
2804    } else {
2805        ctx = alloc_perf_context(pmu, task);
2806        err = -ENOMEM;
2807        if (!ctx)
2808            goto errout;
2809
2810        err = 0;
2811        mutex_lock(&task->perf_event_mutex);
2812        /*
2813         * If it has already passed perf_event_exit_task(),
2814         * we must see PF_EXITING; it takes this mutex too.
2815         */
2816        if (task->flags & PF_EXITING)
2817            err = -ESRCH;
2818        else if (task->perf_event_ctxp[ctxn])
2819            err = -EAGAIN;
2820        else {
2821            get_ctx(ctx);
2822            ++ctx->pin_count;
2823            rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2824        }
2825        mutex_unlock(&task->perf_event_mutex);
2826
2827        if (unlikely(err)) {
2828            put_ctx(ctx);
2829
2830            if (err == -EAGAIN)
2831                goto retry;
2832            goto errout;
2833        }
2834    }
2835
2836    return ctx;
2837
2838errout:
2839    return ERR_PTR(err);
2840}
2841
2842static void perf_event_free_filter(struct perf_event *event);
2843
2844static void free_event_rcu(struct rcu_head *head)
2845{
2846    struct perf_event *event;
2847
2848    event = container_of(head, struct perf_event, rcu_head);
2849    if (event->ns)
2850        put_pid_ns(event->ns);
2851    perf_event_free_filter(event);
2852    kfree(event);
2853}
2854
2855static void ring_buffer_put(struct ring_buffer *rb);
2856
2857static void free_event(struct perf_event *event)
2858{
2859    irq_work_sync(&event->pending);
2860
2861    if (!event->parent) {
2862        if (event->attach_state & PERF_ATTACH_TASK)
2863            static_key_slow_dec_deferred(&perf_sched_events);
2864        if (event->attr.mmap || event->attr.mmap_data)
2865            atomic_dec(&nr_mmap_events);
2866        if (event->attr.comm)
2867            atomic_dec(&nr_comm_events);
2868        if (event->attr.task)
2869            atomic_dec(&nr_task_events);
2870        if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2871            put_callchain_buffers();
2872        if (is_cgroup_event(event)) {
2873            atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2874            static_key_slow_dec_deferred(&perf_sched_events);
2875        }
2876
2877        if (has_branch_stack(event)) {
2878            static_key_slow_dec_deferred(&perf_sched_events);
2879            /* is system-wide event */
2880            if (!(event->attach_state & PERF_ATTACH_TASK))
2881                atomic_dec(&per_cpu(perf_branch_stack_events,
2882                            event->cpu));
2883        }
2884    }
2885
2886    if (event->rb) {
2887        ring_buffer_put(event->rb);
2888        event->rb = NULL;
2889    }
2890
2891    if (is_cgroup_event(event))
2892        perf_detach_cgroup(event);
2893
2894    if (event->destroy)
2895        event->destroy(event);
2896
2897    if (event->ctx)
2898        put_ctx(event->ctx);
2899
2900    call_rcu(&event->rcu_head, free_event_rcu);
2901}
2902
2903int perf_event_release_kernel(struct perf_event *event)
2904{
2905    struct perf_event_context *ctx = event->ctx;
2906
2907    WARN_ON_ONCE(ctx->parent_ctx);
2908    /*
2909     * There are two ways this annotation is useful:
2910     *
2911     * 1) there is a lock recursion from perf_event_exit_task,
2912     * see the comment there.
2913     *
2914     * 2) there is a lock-inversion with mmap_sem through
2915     * perf_event_read_group(), which takes faults while
2916     * holding ctx->mutex, however this is called after
2917     * the last filedesc died, so there is no possibility
2918     * to trigger the AB-BA case.
2919     */
2920    mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
2921    raw_spin_lock_irq(&ctx->lock);
2922    perf_group_detach(event);
2923    raw_spin_unlock_irq(&ctx->lock);
2924    perf_remove_from_context(event);
2925    mutex_unlock(&ctx->mutex);
2926
2927    free_event(event);
2928
2929    return 0;
2930}
2931EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2932
2933/*
2934 * Called when the last reference to the file is gone.
2935 */
2936static int perf_release(struct inode *inode, struct file *file)
2937{
2938    struct perf_event *event = file->private_data;
2939    struct task_struct *owner;
2940
2941    file->private_data = NULL;
2942
2943    rcu_read_lock();
2944    owner = ACCESS_ONCE(event->owner);
2945    /*
2946     * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2947     * !owner it means the list deletion is complete and we can indeed
2948     * free this event, otherwise we need to serialize on
2949     * owner->perf_event_mutex.
2950     */
2951    smp_read_barrier_depends();
2952    if (owner) {
2953        /*
2954         * Since delayed_put_task_struct() also drops the last
2955         * task reference we can safely take a new reference
2956         * while holding the rcu_read_lock().
2957         */
2958        get_task_struct(owner);
2959    }
2960    rcu_read_unlock();
2961
2962    if (owner) {
2963        mutex_lock(&owner->perf_event_mutex);
2964        /*
2965         * We have to re-check the event->owner field, if it is cleared
2966         * we raced with perf_event_exit_task(), acquiring the mutex
2967         * ensured they're done, and we can proceed with freeing the
2968         * event.
2969         */
2970        if (event->owner)
2971            list_del_init(&event->owner_entry);
2972        mutex_unlock(&owner->perf_event_mutex);
2973        put_task_struct(owner);
2974    }
2975
2976    return perf_event_release_kernel(event);
2977}
2978
2979u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
2980{
2981    struct perf_event *child;
2982    u64 total = 0;
2983
2984    *enabled = 0;
2985    *running = 0;
2986
2987    mutex_lock(&event->child_mutex);
2988    total += perf_event_read(event);
2989    *enabled += event->total_time_enabled +
2990            atomic64_read(&event->child_total_time_enabled);
2991    *running += event->total_time_running +
2992            atomic64_read(&event->child_total_time_running);
2993
2994    list_for_each_entry(child, &event->child_list, child_list) {
2995        total += perf_event_read(child);
2996        *enabled += child->total_time_enabled;
2997        *running += child->total_time_running;
2998    }
2999    mutex_unlock(&event->child_mutex);
3000
3001    return total;
3002}
3003EXPORT_SYMBOL_GPL(perf_event_read_value);
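
/*
 * Illustrative note (user-space convention, assumed): when an event was
 * multiplexed and ran for only part of the time it was enabled, tools
 * usually scale the returned count as
 *
 *    estimated = count * enabled / running        (for running != 0)
 *
 * which is why perf_event_read_value() accumulates both times across the
 * event and all of its children along with the raw count.
 */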
3004
3005static int perf_event_read_group(struct perf_event *event,
3006                   u64 read_format, char __user *buf)
3007{
3008    struct perf_event *leader = event->group_leader, *sub;
3009    int n = 0, size = 0, ret = -EFAULT;
3010    struct perf_event_context *ctx = leader->ctx;
3011    u64 values[5];
3012    u64 count, enabled, running;
3013
3014    mutex_lock(&ctx->mutex);
3015    count = perf_event_read_value(leader, &enabled, &running);
3016
3017    values[n++] = 1 + leader->nr_siblings;
3018    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3019        values[n++] = enabled;
3020    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3021        values[n++] = running;
3022    values[n++] = count;
3023    if (read_format & PERF_FORMAT_ID)
3024        values[n++] = primary_event_id(leader);
3025
3026    size = n * sizeof(u64);
3027
3028    if (copy_to_user(buf, values, size))
3029        goto unlock;
3030
3031    ret = size;
3032
3033    list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3034        n = 0;
3035
3036        values[n++] = perf_event_read_value(sub, &enabled, &running);
3037        if (read_format & PERF_FORMAT_ID)
3038            values[n++] = primary_event_id(sub);
3039
3040        size = n * sizeof(u64);
3041
3042        if (copy_to_user(buf + ret, values, size)) {
3043            ret = -EFAULT;
3044            goto unlock;
3045        }
3046
3047        ret += size;
3048    }
3049unlock:
3050    mutex_unlock(&ctx->mutex);
3051
3052    return ret;
3053}
3054
3055static int perf_event_read_one(struct perf_event *event,
3056                 u64 read_format, char __user *buf)
3057{
3058    u64 enabled, running;
3059    u64 values[4];
3060    int n = 0;
3061
3062    values[n++] = perf_event_read_value(event, &enabled, &running);
3063    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3064        values[n++] = enabled;
3065    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3066        values[n++] = running;
3067    if (read_format & PERF_FORMAT_ID)
3068        values[n++] = primary_event_id(event);
3069
3070    if (copy_to_user(buf, values, n * sizeof(u64)))
3071        return -EFAULT;
3072
3073    return n * sizeof(u64);
3074}
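
/*
 * Illustrative sketch of the buffer layouts produced by the two readers
 * above (optional fields appear only if the corresponding PERF_FORMAT_*
 * bit is set in attr.read_format):
 *
 *    // perf_event_read_one():
 *    struct { u64 value; u64 time_enabled; u64 time_running; u64 id; };
 *
 *    // perf_event_read_group(), one read for the whole group:
 *    struct {
 *        u64 nr;                                 // 1 + nr_siblings
 *        u64 time_enabled, time_running;         // leader's times
 *        struct { u64 value; u64 id; } cntr[nr]; // leader first, then siblings
 *    };
 */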
3075
3076/*
3077 * Read the performance event - simple non blocking version for now
3078 */
3079static ssize_t
3080perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3081{
3082    u64 read_format = event->attr.read_format;
3083    int ret;
3084
3085    /*
3086     * Return end-of-file for a read on an event that is in
3087     * error state (i.e. because it was pinned but it couldn't be
3088     * scheduled on to the CPU at some point).
3089     */
3090    if (event->state == PERF_EVENT_STATE_ERROR)
3091        return 0;
3092
3093    if (count < event->read_size)
3094        return -ENOSPC;
3095
3096    WARN_ON_ONCE(event->ctx->parent_ctx);
3097    if (read_format & PERF_FORMAT_GROUP)
3098        ret = perf_event_read_group(event, read_format, buf);
3099    else
3100        ret = perf_event_read_one(event, read_format, buf);
3101
3102    return ret;
3103}
3104
3105static ssize_t
3106perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3107{
3108    struct perf_event *event = file->private_data;
3109
3110    return perf_read_hw(event, buf, count);
3111}
3112
3113static unsigned int perf_poll(struct file *file, poll_table *wait)
3114{
3115    struct perf_event *event = file->private_data;
3116    struct ring_buffer *rb;
3117    unsigned int events = POLL_HUP;
3118
3119    /*
3120     * Race between perf_event_set_output() and perf_poll(): perf_poll()
3121     * grabs the rb reference but perf_event_set_output() overrides it.
3122     * Here is the timeline for two threads T1, T2:
3123     * t0: T1, rb = rcu_dereference(event->rb)
3124     * t1: T2, old_rb = event->rb
3125     * t2: T2, event->rb = new rb
3126     * t3: T2, ring_buffer_detach(old_rb)
3127     * t4: T1, ring_buffer_attach(rb)
3128     * t5: T1, poll_wait(event->waitq)
3129     *
3130     * To avoid this problem, we grab mmap_mutex in perf_poll()
3131     * thereby ensuring that the assignment of the new ring buffer
3132     * and the detachment of the old buffer appear atomic to perf_poll()
3133     */
3134    mutex_lock(&event->mmap_mutex);
3135
3136    rcu_read_lock();
3137    rb = rcu_dereference(event->rb);
3138    if (rb) {
3139        ring_buffer_attach(event, rb);
3140        events = atomic_xchg(&rb->poll, 0);
3141    }
3142    rcu_read_unlock();
3143
3144    mutex_unlock(&event->mmap_mutex);
3145
3146    poll_wait(file, &event->waitq, wait);
3147
3148    return events;
3149}
3150
3151static void perf_event_reset(struct perf_event *event)
3152{
3153    (void)perf_event_read(event);
3154    local64_set(&event->count, 0);
3155    perf_event_update_userpage(event);
3156}
3157
3158/*
3159 * Holding the top-level event's child_mutex means that any
3160 * descendant process that has inherited this event will block
3161 * in sync_child_event if it goes to exit, thus satisfying the
3162 * task existence requirements of perf_event_enable/disable.
3163 */
3164static void perf_event_for_each_child(struct perf_event *event,
3165                    void (*func)(struct perf_event *))
3166{
3167    struct perf_event *child;
3168
3169    WARN_ON_ONCE(event->ctx->parent_ctx);
3170    mutex_lock(&event->child_mutex);
3171    func(event);
3172    list_for_each_entry(child, &event->child_list, child_list)
3173        func(child);
3174    mutex_unlock(&event->child_mutex);
3175}
3176
3177static void perf_event_for_each(struct perf_event *event,
3178                  void (*func)(struct perf_event *))
3179{
3180    struct perf_event_context *ctx = event->ctx;
3181    struct perf_event *sibling;
3182
3183    WARN_ON_ONCE(ctx->parent_ctx);
3184    mutex_lock(&ctx->mutex);
3185    event = event->group_leader;
3186
3187    perf_event_for_each_child(event, func);
3188    list_for_each_entry(sibling, &event->sibling_list, group_entry)
3189        perf_event_for_each_child(sibling, func);
3190    mutex_unlock(&ctx->mutex);
3191}
3192
3193static int perf_event_period(struct perf_event *event, u64 __user *arg)
3194{
3195    struct perf_event_context *ctx = event->ctx;
3196    int ret = 0;
3197    u64 value;
3198
3199    if (!is_sampling_event(event))
3200        return -EINVAL;
3201
3202    if (copy_from_user(&value, arg, sizeof(value)))
3203        return -EFAULT;
3204
3205    if (!value)
3206        return -EINVAL;
3207
3208    raw_spin_lock_irq(&ctx->lock);
3209    if (event->attr.freq) {
3210        if (value > sysctl_perf_event_sample_rate) {
3211            ret = -EINVAL;
3212            goto unlock;
3213        }
3214
3215        event->attr.sample_freq = value;
3216    } else {
3217        event->attr.sample_period = value;
3218        event->hw.sample_period = value;
3219    }
3220unlock:
3221    raw_spin_unlock_irq(&ctx->lock);
3222
3223    return ret;
3224}
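
/*
 * Illustrative sketch, not part of this file: the sampling period of a
 * live event can be changed from user space (fd and the new value are
 * assumed) with
 *
 *    u64 new_period = 200000;
 *    ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period);
 *
 * For freq-mode events (attr.freq set) the same ioctl reinterprets the
 * value as a new sample_freq, bounded by sysctl_perf_event_sample_rate
 * as checked above.
 */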
3225
3226static const struct file_operations perf_fops;
3227
3228static struct perf_event *perf_fget_light(int fd, int *fput_needed)
3229{
3230    struct file *file;
3231
3232    file = fget_light(fd, fput_needed);
3233    if (!file)
3234        return ERR_PTR(-EBADF);
3235
3236    if (file->f_op != &perf_fops) {
3237        fput_light(file, *fput_needed);
3238        *fput_needed = 0;
3239        return ERR_PTR(-EBADF);
3240    }
3241
3242    return file->private_data;
3243}
3244
3245static int perf_event_set_output(struct perf_event *event,
3246                 struct perf_event *output_event);
3247static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3248
3249static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3250{
3251    struct perf_event *event = file->private_data;
3252    void (*func)(struct perf_event *);
3253    u32 flags = arg;
3254
3255    switch (cmd) {
3256    case PERF_EVENT_IOC_ENABLE:
3257        func = perf_event_enable;
3258        break;
3259    case PERF_EVENT_IOC_DISABLE:
3260        func = perf_event_disable;
3261        break;
3262    case PERF_EVENT_IOC_RESET:
3263        func = perf_event_reset;
3264        break;
3265
3266    case PERF_EVENT_IOC_REFRESH:
3267        return perf_event_refresh(event, arg);
3268
3269    case PERF_EVENT_IOC_PERIOD:
3270        return perf_event_period(event, (u64 __user *)arg);
3271
3272    case PERF_EVENT_IOC_SET_OUTPUT:
3273    {
3274        struct perf_event *output_event = NULL;
3275        int fput_needed = 0;
3276        int ret;
3277
3278        if (arg != -1) {
3279            output_event = perf_fget_light(arg, &fput_needed);
3280            if (IS_ERR(output_event))
3281                return PTR_ERR(output_event);
3282        }
3283
3284        ret = perf_event_set_output(event, output_event);
3285        if (output_event)
3286            fput_light(output_event->filp, fput_needed);
3287
3288        return ret;
3289    }
3290
3291    case PERF_EVENT_IOC_SET_FILTER:
3292        return perf_event_set_filter(event, (void __user *)arg);
3293
3294    default:
3295        return -ENOTTY;
3296    }
3297
3298    if (flags & PERF_IOC_FLAG_GROUP)
3299        perf_event_for_each(event, func);
3300    else
3301        perf_event_for_each_child(event, func);
3302
3303    return 0;
3304}
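
/*
 * Illustrative sketch, not part of this file: PERF_IOC_FLAG_GROUP applies
 * the enable/disable/reset operations to an entire event group through the
 * group leader's fd (leader_fd assumed):
 *
 *    ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *
 * which takes the perf_event_for_each() path above instead of
 * perf_event_for_each_child().
 */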
3305
3306int perf_event_task_enable(void)
3307{
3308    struct perf_event *event;
3309
3310    mutex_lock(&current->perf_event_mutex);
3311    list_for_each_entry(event, &current->perf_event_list, owner_entry)
3312        perf_event_for_each_child(event, perf_event_enable);
3313    mutex_unlock(&current->perf_event_mutex);
3314
3315    return 0;
3316}
3317
3318int perf_event_task_disable(void)
3319{
3320    struct perf_event *event;
3321
3322    mutex_lock(&current->perf_event_mutex);
3323    list_for_each_entry(event, &current->perf_event_list, owner_entry)
3324        perf_event_for_each_child(event, perf_event_disable);
3325    mutex_unlock(&current->perf_event_mutex);
3326
3327    return 0;
3328}
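
/*
 * Illustrative sketch, not part of this file: the two helpers above are
 * what the prctl() interface mentioned in the context_equiv() comment ends
 * up calling, e.g.:
 *
 *    prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
 *    // ... section that should not be measured ...
 *    prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
 *
 * which disables/enables every counter owned by the calling task (those on
 * current->perf_event_list), including their inherited children.
 */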
3329
3330static int perf_event_index(struct perf_event *event)
3331{
3332    if (event->hw.state & PERF_HES_STOPPED)
3333        return 0;
3334
3335    if (event->state != PERF_EVENT_STATE_ACTIVE)
3336        return 0;
3337
3338    return event->pmu->event_idx(event);
3339}
3340
3341static void calc_timer_values(struct perf_event *event,
3342                u64 *now,
3343                u64 *enabled,
3344                u64 *running)
3345{
3346    u64 ctx_time;
3347
3348    *now = perf_clock();
3349    ctx_time = event->shadow_ctx_time + *now;
3350    *enabled = ctx_time - event->tstamp_enabled;
3351    *running = ctx_time - event->tstamp_running;
3352}
3353
3354void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3355{
3356}
3357
3358/*
3359 * Callers need to ensure there can be no nesting of this function, otherwise
3360 * the seqlock logic goes bad. We cannot serialize this because the arch
3361 * code calls this from NMI context.
3362 */
3363void perf_event_update_userpage(struct perf_event *event)
3364{
3365    struct perf_event_mmap_page *userpg;
3366    struct ring_buffer *rb;
3367    u64 enabled, running, now;
3368
3369    rcu_read_lock();
3370    /*
3371     * compute total_time_enabled, total_time_running
3372     * based on snapshot values taken when the event
3373     * was last scheduled in.
3374     *
3375     * We cannot simply call update_context_time()
3376     * because of locking issues, as we can be called
3377     * in NMI context.
3378     */
3379    calc_timer_values(event, &now, &enabled, &running);
3380    rb = rcu_dereference(event->rb);
3381    if (!rb)
3382        goto unlock;
3383
3384    userpg = rb->user_page;
3385
3386    /*
3387     * Disable preemption so as to not let the corresponding user-space
3388     * spin too long if we get preempted.
3389     */
3390    preempt_disable();
3391    ++userpg->lock;
3392    barrier();
3393    userpg->index = perf_event_index(event);
3394    userpg->offset = perf_event_count(event);
3395    if (userpg->index)
3396        userpg->offset -= local64_read(&event->hw.prev_count);
3397
3398    userpg->time_enabled = enabled +
3399            atomic64_read(&event->child_total_time_enabled);
3400
3401    userpg->time_running = running +
3402            atomic64_read(&event->child_total_time_running);
3403
3404    arch_perf_update_userpage(userpg, now);
3405
3406    barrier();
3407    ++userpg->lock;
3408    preempt_enable();
3409unlock:
3410    rcu_read_unlock();
3411}
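
/*
 * Illustrative sketch of the matching user-space reader (a simplified
 * variant of the example documented with struct perf_event_mmap_page;
 * "pc" is the mmap()ed user page and is assumed here):
 *
 *    u32 seq;
 *    u64 count, enabled, running;
 *    do {
 *        seq = pc->lock;
 *        barrier();
 *        count   = pc->offset;       // add the rdpmc value if pc->index != 0
 *        enabled = pc->time_enabled;
 *        running = pc->time_running;
 *        barrier();
 *    } while (pc->lock != seq);
 *
 * The paired ++userpg->lock increments above leave the sequence count
 * changed (and odd) while an update is in flight, so the reader loops
 * until it has a consistent snapshot.
 */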
3412
3413static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3414{
3415    struct perf_event *event = vma->vm_file->private_data;
3416    struct ring_buffer *rb;
3417    int ret = VM_FAULT_SIGBUS;
3418
3419    if (vmf->flags & FAULT_FLAG_MKWRITE) {
3420        if (vmf->pgoff == 0)
3421            ret = 0;
3422        return ret;
3423    }
3424
3425    rcu_read_lock();
3426    rb = rcu_dereference(event->rb);
3427    if (!rb)
3428        goto unlock;
3429
3430    if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3431        goto unlock;
3432
3433    vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3434    if (!vmf->page)
3435        goto unlock;
3436
3437    get_page(vmf->page);
3438    vmf->page->mapping = vma->vm_file->f_mapping;
3439    vmf->page->index = vmf->pgoff;
3440
3441    ret = 0;
3442unlock:
3443    rcu_read_unlock();
3444
3445    return ret;
3446}
3447
3448static void ring_buffer_attach(struct perf_event *event,
3449                   struct ring_buffer *rb)
3450{
3451    unsigned long flags;
3452
3453    if (!list_empty(&event->rb_entry))
3454        return;
3455
3456    spin_lock_irqsave(&rb->event_lock, flags);
3457    if (!list_empty(&event->rb_entry))
3458        goto unlock;
3459
3460    list_add(&event->rb_entry, &rb->event_list);
3461unlock:
3462    spin_unlock_irqrestore(&rb->event_lock, flags);
3463}
3464
3465static void ring_buffer_detach(struct perf_event *event,
3466                   struct ring_buffer *rb)
3467{
3468    unsigned long flags;
3469
3470    if (list_empty(&event->rb_entry))
3471        return;
3472
3473    spin_lock_irqsave(&rb->event_lock, flags);
3474    list_del_init(&event->rb_entry);
3475    wake_up_all(&event->waitq);
3476    spin_unlock_irqrestore(&rb->event_lock, flags);
3477}
3478
3479static void ring_buffer_wakeup(struct perf_event *event)
3480{
3481    struct ring_buffer *rb;
3482
3483    rcu_read_lock();
3484    rb = rcu_dereference(event->rb);
3485    if (!rb)
3486        goto unlock;
3487
3488    list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3489        wake_up_all(&event->waitq);
3490
3491unlock:
3492    rcu_read_unlock();
3493}
3494
3495static void rb_free_rcu(struct rcu_head *rcu_head)
3496{
3497    struct ring_buffer *rb;
3498
3499    rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3500    rb_free(rb);
3501}
3502
3503static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3504{
3505    struct ring_buffer *rb;
3506
3507    rcu_read_lock();
3508    rb = rcu_dereference(event->rb);
3509    if (rb) {
3510        if (!atomic_inc_not_zero(&rb->refcount))
3511            rb = NULL;
3512    }
3513    rcu_read_unlock();
3514
3515    return rb;
3516}
3517
3518static void ring_buffer_put(struct ring_buffer *rb)
3519{
3520    struct perf_event *event, *n;
3521    unsigned long flags;
3522
3523    if (!atomic_dec_and_test(&rb->refcount))
3524        return;
3525
3526    spin_lock_irqsave(&rb->event_lock, flags);
3527    list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3528        list_del_init(&event->rb_entry);
3529        wake_up_all(&event->waitq);
3530    }
3531    spin_unlock_irqrestore(&rb->event_lock, flags);
3532
3533    call_rcu(&rb->rcu_head, rb_free_rcu);
3534}
3535
3536static void perf_mmap_open(struct vm_area_struct *vma)
3537{
3538    struct perf_event *event = vma->vm_file->private_data;
3539
3540    atomic_inc(&event->mmap_count);
3541}
3542
3543static void perf_mmap_close(struct vm_area_struct *vma)
3544{
3545    struct perf_event *event = vma->vm_file->private_data;
3546
3547    if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
3548        unsigned long size = perf_data_size(event->rb);
3549        struct user_struct *user = event->mmap_user;
3550        struct ring_buffer *rb = event->rb;
3551
3552        atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3553        vma->vm_mm->pinned_vm -= event->mmap_locked;
3554        rcu_assign_pointer(event->rb, NULL);
3555        ring_buffer_detach(event, rb);
3556        mutex_unlock(&event->mmap_mutex);
3557
3558        ring_buffer_put(rb);
3559        free_uid(user);
3560    }
3561}
3562
3563static const struct vm_operations_struct perf_mmap_vmops = {
3564    .open = perf_mmap_open,
3565    .close = perf_mmap_close,
3566    .fault = perf_mmap_fault,
3567    .page_mkwrite = perf_mmap_fault,
3568};
3569
3570static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3571{
3572    struct perf_event *event = file->private_data;
3573    unsigned long user_locked, user_lock_limit;
3574    struct user_struct *user = current_user();
3575    unsigned long locked, lock_limit;
3576    struct ring_buffer *rb;
3577    unsigned long vma_size;
3578    unsigned long nr_pages;
3579    long user_extra, extra;
3580    int ret = 0, flags = 0;
3581
3582    /*
3583     * Don't allow mmap() of inherited per-task counters. This would
3584     * create a performance issue due to all children writing to the
3585     * same rb.
3586     */
3587    if (event->cpu == -1 && event->attr.inherit)
3588        return -EINVAL;
3589
3590    if (!(vma->vm_flags & VM_SHARED))
3591        return -EINVAL;
3592
3593    vma_size = vma->vm_end - vma->vm_start;
3594    nr_pages = (vma_size / PAGE_SIZE) - 1;
3595
3596    /*
3597     * If we have rb pages ensure they're a power-of-two number, so we
3598     * can do bitmasks instead of modulo.
3599     */
3600    if (nr_pages != 0 && !is_power_of_2(nr_pages))
3601        return -EINVAL;
3602
3603    if (vma_size != PAGE_SIZE * (1 + nr_pages))
3604        return -EINVAL;
3605
3606    if (vma->vm_pgoff != 0)
3607        return -EINVAL;
3608
3609    WARN_ON_ONCE(event->ctx->parent_ctx);
3610    mutex_lock(&event->mmap_mutex);
3611    if (event->rb) {
3612        if (event->rb->nr_pages == nr_pages)
3613            atomic_inc(&event->rb->refcount);
3614        else
3615            ret = -EINVAL;
3616        goto unlock;
3617    }
3618
3619    user_extra = nr_pages + 1;
3620    user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3621
3622    /*
3623     * Increase the limit linearly with more CPUs:
3624     */
3625    user_lock_limit *= num_online_cpus();
3626
3627    user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3628
3629    extra = 0;
3630    if (user_locked > user_lock_limit)
3631        extra = user_locked - user_lock_limit;
3632
3633    lock_limit = rlimit(RLIMIT_MEMLOCK);
3634    lock_limit >>= PAGE_SHIFT;
3635    locked = vma->vm_mm->pinned_vm + extra;
3636
3637    if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3638        !capable(CAP_IPC_LOCK)) {
3639        ret = -EPERM;
3640        goto unlock;
3641    }
3642
3643    WARN_ON(event->rb);
3644
3645    if (vma->vm_flags & VM_WRITE)
3646        flags |= RING_BUFFER_WRITABLE;
3647
3648    rb = rb_alloc(nr_pages,
3649        event->attr.watermark ? event->attr.wakeup_watermark : 0,
3650        event->cpu, flags);
3651
3652    if (!rb) {
3653        ret = -ENOMEM;
3654        goto unlock;
3655    }
3656    rcu_assign_pointer(event->rb, rb);
3657
3658    atomic_long_add(user_extra, &user->locked_vm);
3659    event->mmap_locked = extra;
3660    event->mmap_user = get_current_user();
3661    vma->vm_mm->pinned_vm += event->mmap_locked;
3662
3663    perf_event_update_userpage(event);
3664
3665unlock:
3666    if (!ret)
3667        atomic_inc(&event->mmap_count);
3668    mutex_unlock(&event->mmap_mutex);
3669
3670    vma->vm_flags |= VM_RESERVED;
3671    vma->vm_ops = &perf_mmap_vmops;
3672
3673    return ret;
3674}
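
/*
 * Illustrative sketch, not part of this file: the size checks above mean a
 * user-space mapping must cover the metadata page plus a power-of-two
 * number of data pages, e.g. (fd and page_size assumed):
 *
 *    size_t len = (1 + 8) * page_size;    // 1 control page + 8 data pages
 *    void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                      MAP_SHARED, fd, 0);
 *
 * base then points at the struct perf_event_mmap_page control page, with
 * the ring buffer data starting one page further in.
 */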
3675
3676static int perf_fasync(int fd, struct file *filp, int on)
3677{
3678    struct inode *inode = filp->f_path.dentry->d_inode;
3679    struct perf_event *event = filp->private_data;
3680    int retval;
3681
3682    mutex_lock(&inode->i_mutex);
3683    retval = fasync_helper(fd, filp, on, &event->fasync);
3684    mutex_unlock(&inode->i_mutex);
3685
3686    if (retval < 0)
3687        return retval;
3688
3689    return 0;
3690}
3691
3692static const struct file_operations perf_fops = {
3693    .llseek = no_llseek,
3694    .release = perf_release,
3695    .read = perf_read,
3696    .poll = perf_poll,
3697    .unlocked_ioctl = perf_ioctl,
3698    .compat_ioctl = perf_ioctl,
3699    .mmap = perf_mmap,
3700    .fasync = perf_fasync,
3701};
3702
3703/*
3704 * Perf event wakeup
3705 *
3706 * If there's data, ensure we set the poll() state and publish everything
3707 * to user-space before waking everybody up.
3708 */
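/*
 * ring_buffer_wakeup() wakes every event attached to the ring buffer
 * (poll()/epoll waiters); a non-zero pending_kill additionally sends
 * SIGIO through the fasync list registered via perf_fasync().
 */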
3709
3710void perf_event_wakeup(struct perf_event *event)
3711{
3712    ring_buffer_wakeup(event);
3713
3714    if (event->pending_kill) {
3715        kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3716        event->pending_kill = 0;
3717    }
3718}
3719
3720static void perf_pending_event(struct irq_work *entry)
3721{
3722    struct perf_event *event = container_of(entry,
3723            struct perf_event, pending);
3724
3725    if (event->pending_disable) {
3726        event->pending_disable = 0;
3727        __perf_event_disable(event);
3728    }
3729
3730    if (event->pending_wakeup) {
3731        event->pending_wakeup = 0;
3732        perf_event_wakeup(event);
3733    }
3734}
3735
3736/*
3737 * We assume there is only KVM supporting the callbacks.
3738 * Later on, we might change it to a list if there is
3739 * another virtualization implementation supporting the callbacks.
3740 */
3741struct perf_guest_info_callbacks *perf_guest_cbs;
3742
3743int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3744{
3745    perf_guest_cbs = cbs;
3746    return 0;
3747}
3748EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
3749
3750int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
3751{
3752    perf_guest_cbs = NULL;
3753    return 0;
3754}
3755EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
3756
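/*
 * Layout of the sample_id_all trailer appended to non-sample records,
 * in the order __perf_event__output_id_sample() writes it:
 *
 *      { u32 pid, tid;  } && PERF_SAMPLE_TID
 *      { u64 time;      } && PERF_SAMPLE_TIME
 *      { u64 id;        } && PERF_SAMPLE_ID
 *      { u64 stream_id; } && PERF_SAMPLE_STREAM_ID
 *      { u32 cpu, res;  } && PERF_SAMPLE_CPU
 */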
3757static void __perf_event_header__init_id(struct perf_event_header *header,
3758                     struct perf_sample_data *data,
3759                     struct perf_event *event)
3760{
3761    u64 sample_type = event->attr.sample_type;
3762
3763    data->type = sample_type;
3764    header->size += event->id_header_size;
3765
3766    if (sample_type & PERF_SAMPLE_TID) {
3767        /* namespace issues */
3768        data->tid_entry.pid = perf_event_pid(event, current);
3769        data->tid_entry.tid = perf_event_tid(event, current);
3770    }
3771
3772    if (sample_type & PERF_SAMPLE_TIME)
3773        data->time = perf_clock();
3774
3775    if (sample_type & PERF_SAMPLE_ID)
3776        data->id = primary_event_id(event);
3777
3778    if (sample_type & PERF_SAMPLE_STREAM_ID)
3779        data->stream_id = event->id;
3780
3781    if (sample_type & PERF_SAMPLE_CPU) {
3782        data->cpu_entry.cpu = raw_smp_processor_id();
3783        data->cpu_entry.reserved = 0;
3784    }
3785}
3786
3787void perf_event_header__init_id(struct perf_event_header *header,
3788                struct perf_sample_data *data,
3789                struct perf_event *event)
3790{
3791    if (event->attr.sample_id_all)
3792        __perf_event_header__init_id(header, data, event);
3793}
3794
3795static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3796                       struct perf_sample_data *data)
3797{
3798    u64 sample_type = data->type;
3799
3800    if (sample_type & PERF_SAMPLE_TID)
3801        perf_output_put(handle, data->tid_entry);
3802
3803    if (sample_type & PERF_SAMPLE_TIME)
3804        perf_output_put(handle, data->time);
3805
3806    if (sample_type & PERF_SAMPLE_ID)
3807        perf_output_put(handle, data->id);
3808
3809    if (sample_type & PERF_SAMPLE_STREAM_ID)
3810        perf_output_put(handle, data->stream_id);
3811
3812    if (sample_type & PERF_SAMPLE_CPU)
3813        perf_output_put(handle, data->cpu_entry);
3814}
3815
3816void perf_event__output_id_sample(struct perf_event *event,
3817                  struct perf_output_handle *handle,
3818                  struct perf_sample_data *sample)
3819{
3820    if (event->attr.sample_id_all)
3821        __perf_event__output_id_sample(handle, sample);
3822}
3823
3824static void perf_output_read_one(struct perf_output_handle *handle,
3825                 struct perf_event *event,
3826                 u64 enabled, u64 running)
3827{
3828    u64 read_format = event->attr.read_format;
3829    u64 values[4];
3830    int n = 0;
3831
3832    values[n++] = perf_event_count(event);
3833    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3834        values[n++] = enabled +
3835            atomic64_read(&event->child_total_time_enabled);
3836    }
3837    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3838        values[n++] = running +
3839            atomic64_read(&event->child_total_time_running);
3840    }
3841    if (read_format & PERF_FORMAT_ID)
3842        values[n++] = primary_event_id(event);
3843
3844    __output_copy(handle, values, n * sizeof(u64));
3845}
3846
3847/*
3848 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3849 */
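/*
 * Record layout produced by perf_output_read_group() below; the inner
 * { value, id } pair is written once for the leader and once per sibling:
 *
 *      u64 nr;
 *      { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
 *      { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
 *      { u64 value;
 *        { u64 id; }         && PERF_FORMAT_ID
 *      } cntr[nr];
 */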
3850static void perf_output_read_group(struct perf_output_handle *handle,
3851                struct perf_event *event,
3852                u64 enabled, u64 running)
3853{
3854    struct perf_event *leader = event->group_leader, *sub;
3855    u64 read_format = event->attr.read_format;
3856    u64 values[5];
3857    int n = 0;
3858
3859    values[n++] = 1 + leader->nr_siblings;
3860
3861    if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3862        values[n++] = enabled;
3863
3864    if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3865        values[n++] = running;
3866
3867    if (leader != event)
3868        leader->pmu->read(leader);
3869
3870    values[n++] = perf_event_count(leader);
3871    if (read_format & PERF_FORMAT_ID)
3872        values[n++] = primary_event_id(leader);
3873
3874    __output_copy(handle, values, n * sizeof(u64));
3875
3876    list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3877        n = 0;
3878
3879        if (sub != event)
3880            sub->pmu->read(sub);
3881
3882        values[n++] = perf_event_count(sub);
3883        if (read_format & PERF_FORMAT_ID)
3884            values[n++] = primary_event_id(sub);
3885
3886        __output_copy(handle, values, n * sizeof(u64));
3887    }
3888}
3889
3890#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3891                 PERF_FORMAT_TOTAL_TIME_RUNNING)
3892
3893static void perf_output_read(struct perf_output_handle *handle,
3894                 struct perf_event *event)
3895{
3896    u64 enabled = 0, running = 0, now;
3897    u64 read_format = event->attr.read_format;
3898
3899    /*
3900     * compute total_time_enabled, total_time_running
3901     * based on snapshot values taken when the event
3902     * was last scheduled in.
3903     *
3904     * we cannot simply call update_context_time()
3905     * because of locking issues, as we may be called
3906     * from NMI context.
3907     */
3908    if (read_format & PERF_FORMAT_TOTAL_TIMES)
3909        calc_timer_values(event, &now, &enabled, &running);
3910
3911    if (event->attr.read_format & PERF_FORMAT_GROUP)
3912        perf_output_read_group(handle, event, enabled, running);
3913    else
3914        perf_output_read_one(handle, event, enabled, running);
3915}
3916
3917void perf_output_sample(struct perf_output_handle *handle,
3918            struct perf_event_header *header,
3919            struct perf_sample_data *data,
3920            struct perf_event *event)
3921{
3922    u64 sample_type = data->type;
3923
3924    perf_output_put(handle, *header);
3925
3926    if (sample_type & PERF_SAMPLE_IP)
3927        perf_output_put(handle, data->ip);
3928
3929    if (sample_type & PERF_SAMPLE_TID)
3930        perf_output_put(handle, data->tid_entry);
3931
3932    if (sample_type & PERF_SAMPLE_TIME)
3933        perf_output_put(handle, data->time);
3934
3935    if (sample_type & PERF_SAMPLE_ADDR)
3936        perf_output_put(handle, data->addr);
3937
3938    if (sample_type & PERF_SAMPLE_ID)
3939        perf_output_put(handle, data->id);
3940
3941    if (sample_type & PERF_SAMPLE_STREAM_ID)
3942        perf_output_put(handle, data->stream_id);
3943
3944    if (sample_type & PERF_SAMPLE_CPU)
3945        perf_output_put(handle, data->cpu_entry);
3946
3947    if (sample_type & PERF_SAMPLE_PERIOD)
3948        perf_output_put(handle, data->period);
3949
3950    if (sample_type & PERF_SAMPLE_READ)
3951        perf_output_read(handle, event);
3952
3953    if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3954        if (data->callchain) {
3955            int size = 1;
3956
3957            if (data->callchain)
3958                size += data->callchain->nr;
3959
3960            size *= sizeof(u64);
3961
3962            __output_copy(handle, data->callchain, size);
3963        } else {
3964            u64 nr = 0;
3965            perf_output_put(handle, nr);
3966        }
3967    }
3968
3969    if (sample_type & PERF_SAMPLE_RAW) {
3970        if (data->raw) {
3971            perf_output_put(handle, data->raw->size);
3972            __output_copy(handle, data->raw->data,
3973                       data->raw->size);
3974        } else {
3975            struct {
3976                u32 size;
3977                u32 data;
3978            } raw = {
3979                .size = sizeof(u32),
3980                .data = 0,
3981            };
3982            perf_output_put(handle, raw);
3983        }
3984    }
3985
3986    if (!event->attr.watermark) {
3987        int wakeup_events = event->attr.wakeup_events;
3988
3989        if (wakeup_events) {
3990            struct ring_buffer *rb = handle->rb;
3991            int events = local_inc_return(&rb->events);
3992
3993            if (events >= wakeup_events) {
3994                local_sub(wakeup_events, &rb->events);
3995                local_inc(&rb->wakeup);
3996            }
3997        }
3998    }
3999
4000    if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4001        if (data->br_stack) {
4002            size_t size;
4003
4004            size = data->br_stack->nr
4005                 * sizeof(struct perf_branch_entry);
4006
4007            perf_output_put(handle, data->br_stack->nr);
4008            perf_output_copy(handle, data->br_stack->entries, size);
4009        } else {
4010            /*
4011             * we always store at least the value of nr
4012             */
4013            u64 nr = 0;
4014            perf_output_put(handle, nr);
4015        }
4016    }
4017}
4018
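/*
 * perf_prepare_sample() computes the final header->size (including
 * variable sized parts such as the callchain and raw data) so that
 * perf_output_sample() can write exactly that many bytes into the
 * space reserved by perf_output_begin().
 */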
4019void perf_prepare_sample(struct perf_event_header *header,
4020             struct perf_sample_data *data,
4021             struct perf_event *event,
4022             struct pt_regs *regs)
4023{
4024    u64 sample_type = event->attr.sample_type;
4025
4026    header->type = PERF_RECORD_SAMPLE;
4027    header->size = sizeof(*header) + event->header_size;
4028
4029    header->misc = 0;
4030    header->misc |= perf_misc_flags(regs);
4031
4032    __perf_event_header__init_id(header, data, event);
4033
4034    if (sample_type & PERF_SAMPLE_IP)
4035        data->ip = perf_instruction_pointer(regs);
4036
4037    if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4038        int size = 1;
4039
4040        data->callchain = perf_callchain(regs);
4041
4042        if (data->callchain)
4043            size += data->callchain->nr;
4044
4045        header->size += size * sizeof(u64);
4046    }
4047
4048    if (sample_type & PERF_SAMPLE_RAW) {
4049        int size = sizeof(u32);
4050
4051        if (data->raw)
4052            size += data->raw->size;
4053        else
4054            size += sizeof(u32);
4055
4056        WARN_ON_ONCE(size & (sizeof(u64)-1));
4057        header->size += size;
4058    }
4059
4060    if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4061        int size = sizeof(u64); /* nr */
4062        if (data->br_stack) {
4063            size += data->br_stack->nr
4064                  * sizeof(struct perf_branch_entry);
4065        }
4066        header->size += size;
4067    }
4068}
4069
4070static void perf_event_output(struct perf_event *event,
4071                struct perf_sample_data *data,
4072                struct pt_regs *regs)
4073{
4074    struct perf_output_handle handle;
4075    struct perf_event_header header;
4076
4077    /* protect the callchain buffers */
4078    rcu_read_lock();
4079
4080    perf_prepare_sample(&header, data, event, regs);
4081
4082    if (perf_output_begin(&handle, event, header.size))
4083        goto exit;
4084
4085    perf_output_sample(&handle, &header, data, event);
4086
4087    perf_output_end(&handle);
4088
4089exit:
4090    rcu_read_unlock();
4091}
4092
4093/*
4094 * read event_id
4095 */
4096
4097struct perf_read_event {
4098    struct perf_event_header header;
4099
4100    u32 pid;
4101    u32 tid;
4102};
4103
4104static void
4105perf_event_read_event(struct perf_event *event,
4106            struct task_struct *task)
4107{
4108    struct perf_output_handle handle;
4109    struct perf_sample_data sample;
4110    struct perf_read_event read_event = {
4111        .header = {
4112            .type = PERF_RECORD_READ,
4113            .misc = 0,
4114            .size = sizeof(read_event) + event->read_size,
4115        },
4116        .pid = perf_event_pid(event, task),
4117        .tid = perf_event_tid(event, task),
4118    };
4119    int ret;
4120
4121    perf_event_header__init_id(&read_event.header, &sample, event);
4122    ret = perf_output_begin(&handle, event, read_event.header.size);
4123    if (ret)
4124        return;
4125
4126    perf_output_put(&handle, read_event);
4127    perf_output_read(&handle, event);
4128    perf_event__output_id_sample(event, &handle, &sample);
4129
4130    perf_output_end(&handle);
4131}
4132
4133/*
4134 * task tracking -- fork/exit
4135 *
4136 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
4137 */
4138
4139struct perf_task_event {
4140    struct task_struct *task;
4141    struct perf_event_context *task_ctx;
4142
4143    struct {
4144        struct perf_event_header header;
4145
4146        u32 pid;
4147        u32 ppid;
4148        u32 tid;
4149        u32 ptid;
4150        u64 time;
4151    } event_id;
4152};
4153
4154static void perf_event_task_output(struct perf_event *event,
4155                     struct perf_task_event *task_event)
4156{
4157    struct perf_output_handle handle;
4158    struct perf_sample_data sample;
4159    struct task_struct *task = task_event->task;
4160    int ret, size = task_event->event_id.header.size;
4161
4162    perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4163
4164    ret = perf_output_begin(&handle, event,
4165                task_event->event_id.header.size);
4166    if (ret)
4167        goto out;
4168
4169    task_event->event_id.pid = perf_event_pid(event, task);
4170    task_event->event_id.ppid = perf_event_pid(event, current);
4171
4172    task_event->event_id.tid = perf_event_tid(event, task);
4173    task_event->event_id.ptid = perf_event_tid(event, current);
4174
4175    perf_output_put(&handle, task_event->event_id);
4176
4177    perf_event__output_id_sample(event, &handle, &sample);
4178
4179    perf_output_end(&handle);
4180out:
4181    task_event->event_id.header.size = size;
4182}
4183
4184static int perf_event_task_match(struct perf_event *event)
4185{
4186    if (event->state < PERF_EVENT_STATE_INACTIVE)
4187        return 0;
4188
4189    if (!event_filter_match(event))
4190        return 0;
4191
4192    if (event->attr.comm || event->attr.mmap ||
4193        event->attr.mmap_data || event->attr.task)
4194        return 1;
4195
4196    return 0;
4197}
4198
4199static void perf_event_task_ctx(struct perf_event_context *ctx,
4200                  struct perf_task_event *task_event)
4201{
4202    struct perf_event *event;
4203
4204    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4205        if (perf_event_task_match(event))
4206            perf_event_task_output(event, task_event);
4207    }
4208}
4209
4210static void perf_event_task_event(struct perf_task_event *task_event)
4211{
4212    struct perf_cpu_context *cpuctx;
4213    struct perf_event_context *ctx;
4214    struct pmu *pmu;
4215    int ctxn;
4216
4217    rcu_read_lock();
4218    list_for_each_entry_rcu(pmu, &pmus, entry) {
4219        cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4220        if (cpuctx->active_pmu != pmu)
4221            goto next;
4222        perf_event_task_ctx(&cpuctx->ctx, task_event);
4223
4224        ctx = task_event->task_ctx;
4225        if (!ctx) {
4226            ctxn = pmu->task_ctx_nr;
4227            if (ctxn < 0)
4228                goto next;
4229            ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4230        }
4231        if (ctx)
4232            perf_event_task_ctx(ctx, task_event);
4233next:
4234        put_cpu_ptr(pmu->pmu_cpu_context);
4235    }
4236    rcu_read_unlock();
4237}
4238
4239static void perf_event_task(struct task_struct *task,
4240                  struct perf_event_context *task_ctx,
4241                  int new)
4242{
4243    struct perf_task_event task_event;
4244
4245    if (!atomic_read(&nr_comm_events) &&
4246        !atomic_read(&nr_mmap_events) &&
4247        !atomic_read(&nr_task_events))
4248        return;
4249
4250    task_event = (struct perf_task_event){
4251        .task = task,
4252        .task_ctx = task_ctx,
4253        .event_id = {
4254            .header = {
4255                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4256                .misc = 0,
4257                .size = sizeof(task_event.event_id),
4258            },
4259            /* .pid */
4260            /* .ppid */
4261            /* .tid */
4262            /* .ptid */
4263            .time = perf_clock(),
4264        },
4265    };
4266
4267    perf_event_task_event(&task_event);
4268}
4269
4270void perf_event_fork(struct task_struct *task)
4271{
4272    perf_event_task(task, NULL, 1);
4273}
4274
4275/*
4276 * comm tracking
4277 */
4278
4279struct perf_comm_event {
4280    struct task_struct *task;
4281    char *comm;
4282    int comm_size;
4283
4284    struct {
4285        struct perf_event_header header;
4286
4287        u32 pid;
4288        u32 tid;
4289    } event_id;
4290};
4291
4292static void perf_event_comm_output(struct perf_event *event,
4293                     struct perf_comm_event *comm_event)
4294{
4295    struct perf_output_handle handle;
4296    struct perf_sample_data sample;
4297    int size = comm_event->event_id.header.size;
4298    int ret;
4299
4300    perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4301    ret = perf_output_begin(&handle, event,
4302                comm_event->event_id.header.size);
4303
4304    if (ret)
4305        goto out;
4306
4307    comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4308    comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4309
4310    perf_output_put(&handle, comm_event->event_id);
4311    __output_copy(&handle, comm_event->comm,
4312                   comm_event->comm_size);
4313
4314    perf_event__output_id_sample(event, &handle, &sample);
4315
4316    perf_output_end(&handle);
4317out:
4318    comm_event->event_id.header.size = size;
4319}
4320
4321static int perf_event_comm_match(struct perf_event *event)
4322{
4323    if (event->state < PERF_EVENT_STATE_INACTIVE)
4324        return 0;
4325
4326    if (!event_filter_match(event))
4327        return 0;
4328
4329    if (event->attr.comm)
4330        return 1;
4331
4332    return 0;
4333}
4334
4335static void perf_event_comm_ctx(struct perf_event_context *ctx,
4336                  struct perf_comm_event *comm_event)
4337{
4338    struct perf_event *event;
4339
4340    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4341        if (perf_event_comm_match(event))
4342            perf_event_comm_output(event, comm_event);
4343    }
4344}
4345
4346static void perf_event_comm_event(struct perf_comm_event *comm_event)
4347{
4348    struct perf_cpu_context *cpuctx;
4349    struct perf_event_context *ctx;
4350    char comm[TASK_COMM_LEN];
4351    unsigned int size;
4352    struct pmu *pmu;
4353    int ctxn;
4354
4355    memset(comm, 0, sizeof(comm));
4356    strlcpy(comm, comm_event->task->comm, sizeof(comm));
4357    size = ALIGN(strlen(comm)+1, sizeof(u64));
4358
4359    comm_event->comm = comm;
4360    comm_event->comm_size = size;
4361
4362    comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4363    rcu_read_lock();
4364    list_for_each_entry_rcu(pmu, &pmus, entry) {
4365        cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4366        if (cpuctx->active_pmu != pmu)
4367            goto next;
4368        perf_event_comm_ctx(&cpuctx->ctx, comm_event);
4369
4370        ctxn = pmu->task_ctx_nr;
4371        if (ctxn < 0)
4372            goto next;
4373
4374        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4375        if (ctx)
4376            perf_event_comm_ctx(ctx, comm_event);
4377next:
4378        put_cpu_ptr(pmu->pmu_cpu_context);
4379    }
4380    rcu_read_unlock();
4381}
4382
4383void perf_event_comm(struct task_struct *task)
4384{
4385    struct perf_comm_event comm_event;
4386    struct perf_event_context *ctx;
4387    int ctxn;
4388
4389    for_each_task_context_nr(ctxn) {
4390        ctx = task->perf_event_ctxp[ctxn];
4391        if (!ctx)
4392            continue;
4393
4394        perf_event_enable_on_exec(ctx);
4395    }
4396
4397    if (!atomic_read(&nr_comm_events))
4398        return;
4399
4400    comm_event = (struct perf_comm_event){
4401        .task = task,
4402        /* .comm */
4403        /* .comm_size */
4404        .event_id = {
4405            .header = {
4406                .type = PERF_RECORD_COMM,
4407                .misc = 0,
4408                /* .size */
4409            },
4410            /* .pid */
4411            /* .tid */
4412        },
4413    };
4414
4415    perf_event_comm_event(&comm_event);
4416}
4417
4418/*
4419 * mmap tracking
4420 */
4421
4422struct perf_mmap_event {
4423    struct vm_area_struct *vma;
4424
4425    const char *file_name;
4426    int file_size;
4427
4428    struct {
4429        struct perf_event_header header;
4430
4431        u32 pid;
4432        u32 tid;
4433        u64 start;
4434        u64 len;
4435        u64 pgoff;
4436    } event_id;
4437};
4438
4439static void perf_event_mmap_output(struct perf_event *event,
4440                     struct perf_mmap_event *mmap_event)
4441{
4442    struct perf_output_handle handle;
4443    struct perf_sample_data sample;
4444    int size = mmap_event->event_id.header.size;
4445    int ret;
4446
4447    perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4448    ret = perf_output_begin(&handle, event,
4449                mmap_event->event_id.header.size);
4450    if (ret)
4451        goto out;
4452
4453    mmap_event->event_id.pid = perf_event_pid(event, current);
4454    mmap_event->event_id.tid = perf_event_tid(event, current);
4455
4456    perf_output_put(&handle, mmap_event->event_id);
4457    __output_copy(&handle, mmap_event->file_name,
4458                   mmap_event->file_size);
4459
4460    perf_event__output_id_sample(event, &handle, &sample);
4461
4462    perf_output_end(&handle);
4463out:
4464    mmap_event->event_id.header.size = size;
4465}
4466
4467static int perf_event_mmap_match(struct perf_event *event,
4468                   struct perf_mmap_event *mmap_event,
4469                   int executable)
4470{
4471    if (event->state < PERF_EVENT_STATE_INACTIVE)
4472        return 0;
4473
4474    if (!event_filter_match(event))
4475        return 0;
4476
4477    if ((!executable && event->attr.mmap_data) ||
4478        (executable && event->attr.mmap))
4479        return 1;
4480
4481    return 0;
4482}
4483
4484static void perf_event_mmap_ctx(struct perf_event_context *ctx,
4485                  struct perf_mmap_event *mmap_event,
4486                  int executable)
4487{
4488    struct perf_event *event;
4489
4490    list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4491        if (perf_event_mmap_match(event, mmap_event, executable))
4492            perf_event_mmap_output(event, mmap_event);
4493    }
4494}
4495
4496static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4497{
4498    struct perf_cpu_context *cpuctx;
4499    struct perf_event_context *ctx;
4500    struct vm_area_struct *vma = mmap_event->vma;
4501    struct file *file = vma->vm_file;
4502    unsigned int size;
4503    char tmp[16];
4504    char *buf = NULL;
4505    const char *name;
4506    struct pmu *pmu;
4507    int ctxn;
4508
4509    memset(tmp, 0, sizeof(tmp));
4510
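    /*
     * Resolve a printable name for the mapping: the d_path() of the
     * backing file when there is one, otherwise an arch-provided name
     * or one of the synthetic "[vdso]", "[heap]", "[stack]" or "//anon"
     * markers ("//enomem" / "//toolong" on allocation or d_path failure).
     */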
4511    if (file) {
4512         * d_path() works from the end of the buffer backwards, so we
4513         * d_path works from the end of the rb backwards, so we
4514         * need to add enough zero bytes after the string to handle
4515         * the 64bit alignment we do later.
4516         */
4517        buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4518        if (!buf) {
4519            name = strncpy(tmp, "//enomem", sizeof(tmp));
4520            goto got_name;
4521        }
4522        name = d_path(&file->f_path, buf, PATH_MAX);
4523        if (IS_ERR(name)) {
4524            name = strncpy(tmp, "//toolong", sizeof(tmp));
4525            goto got_name;
4526        }
4527    } else {
4528        if (arch_vma_name(mmap_event->vma)) {
4529            name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4530                       sizeof(tmp));
4531            goto got_name;
4532        }
4533
4534        if (!vma->vm_mm) {
4535            name = strncpy(tmp, "[vdso]", sizeof(tmp));
4536            goto got_name;
4537        } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4538                vma->vm_end >= vma->vm_mm->brk) {
4539            name = strncpy(tmp, "[heap]", sizeof(tmp));
4540            goto got_name;
4541        } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4542                vma->vm_end >= vma->vm_mm->start_stack) {
4543            name = strncpy(tmp, "[stack]", sizeof(tmp));
4544            goto got_name;
4545        }
4546
4547        name = strncpy(tmp, "//anon", sizeof(tmp));
4548        goto got_name;
4549    }
4550
4551got_name:
4552    size = ALIGN(strlen(name)+1, sizeof(u64));
4553
4554    mmap_event->file_name = name;
4555    mmap_event->file_size = size;
4556
4557    mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4558
4559    rcu_read_lock();
4560    list_for_each_entry_rcu(pmu, &pmus, entry) {
4561        cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4562        if (cpuctx->active_pmu != pmu)
4563            goto next;
4564        perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4565                    vma->vm_flags & VM_EXEC);
4566
4567        ctxn = pmu->task_ctx_nr;
4568        if (ctxn < 0)
4569            goto next;
4570
4571        ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4572        if (ctx) {
4573            perf_event_mmap_ctx(ctx, mmap_event,
4574                    vma->vm_flags & VM_EXEC);
4575        }
4576next:
4577        put_cpu_ptr(pmu->pmu_cpu_context);
4578    }
4579    rcu_read_unlock();
4580
4581    kfree(buf);
4582}
4583
4584void perf_event_mmap(struct vm_area_struct *vma)
4585{
4586    struct perf_mmap_event mmap_event;
4587
4588    if (!atomic_read(&nr_mmap_events))
4589        return;
4590
4591    mmap_event = (struct perf_mmap_event){
4592        .vma = vma,
4593        /* .file_name */
4594        /* .file_size */
4595        .event_id = {
4596            .header = {
4597                .type = PERF_RECORD_MMAP,
4598                .misc = PERF_RECORD_MISC_USER,
4599                /* .size */
4600            },
4601            /* .pid */
4602            /* .tid */
4603            .start = vma->vm_start,
4604            .len = vma->vm_end - vma->vm_start,
4605            .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
4606        },
4607    };
4608
4609    perf_event_mmap_event(&mmap_event);
4610}
4611
4612/*
4613 * IRQ throttle logging
4614 */
4615
4616static void perf_log_throttle(struct perf_event *event, int enable)
4617{
4618    struct perf_output_handle handle;
4619    struct perf_sample_data sample;
4620    int ret;
4621
4622    struct {
4623        struct perf_event_header header;
4624        u64 time;
4625        u64 id;
4626        u64 stream_id;
4627    } throttle_event = {
4628        .header = {
4629            .type = PERF_RECORD_THROTTLE,
4630            .misc = 0,
4631            .size = sizeof(throttle_event),
4632        },
4633        .time = perf_clock(),
4634        .id = primary_event_id(event),
4635        .stream_id = event->id,
4636    };
4637
4638    if (enable)
4639        throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4640
4641    perf_event_header__init_id(&throttle_event.header, &sample, event);
4642
4643    ret = perf_output_begin(&handle, event,
4644                throttle_event.header.size);
4645    if (ret)
4646        return;
4647
4648    perf_output_put(&handle, throttle_event);
4649    perf_event__output_id_sample(event, &handle, &sample);
4650    perf_output_end(&handle);
4651}
4652
4653/*
4654 * Generic event overflow handling, sampling.
4655 */
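/*
 * Throttling sketch: hwc->interrupts counts PMIs within the current
 * throttle period (perf_throttled_seq advances once per tick); once it
 * reaches max_samples_per_tick the event is flagged MAX_INTERRUPTS, a
 * PERF_RECORD_THROTTLE is logged and __perf_event_overflow() returns 1,
 * telling the caller to stop the event.  Unthrottling is driven later
 * from the timer tick path.
 */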
4656
4657static int __perf_event_overflow(struct perf_event *event,
4658                   int throttle, struct perf_sample_data *data,
4659                   struct pt_regs *regs)
4660{
4661    int events = atomic_read(&event->event_limit);
4662    struct hw_perf_event *hwc = &event->hw;
4663    u64 seq;
4664    int ret = 0;
4665
4666    /*
4667     * Non-sampling counters might still use the PMI to fold short
4668     * hardware counters; ignore those.
4669     */
4670    if (unlikely(!is_sampling_event(event)))
4671        return 0;
4672
4673    seq = __this_cpu_read(perf_throttled_seq);
4674    if (seq != hwc->interrupts_seq) {
4675        hwc->interrupts_seq = seq;
4676        hwc->interrupts = 1;
4677    } else {
4678        hwc->interrupts++;
4679        if (unlikely(throttle
4680                 && hwc->interrupts >= max_samples_per_tick)) {
4681            __this_cpu_inc(perf_throttled_count);
4682            hwc->interrupts = MAX_INTERRUPTS;
4683            perf_log_throttle(event, 0);
4684            ret = 1;
4685        }
4686    }
4687
4688    if (event->attr.freq) {
4689        u64 now = perf_clock();
4690        s64 delta = now - hwc->freq_time_stamp;
4691
4692        hwc->freq_time_stamp = now;
4693
4694        if (delta > 0 && delta < 2*TICK_NSEC)
4695            perf_adjust_period(event, delta, hwc->last_period, true);
4696    }
4697
4698    /*
4699     * XXX event_limit might not quite work as expected on inherited
4700     * events
4701     */
4702
4703    event->pending_kill = POLL_IN;
4704    if (events && atomic_dec_and_test(&event->event_limit)) {
4705        ret = 1;
4706        event->pending_kill = POLL_HUP;
4707        event->pending_disable = 1;
4708        irq_work_queue(&event->pending);
4709    }
4710
4711    if (event->overflow_handler)
4712        event->overflow_handler(event, data, regs);
4713    else
4714        perf_event_output(event, data, regs);
4715
4716    if (event->fasync && event->pending_kill) {
4717        event->pending_wakeup = 1;
4718        irq_work_queue(&event->pending);
4719    }
4720
4721    return ret;
4722}
4723
4724int perf_event_overflow(struct perf_event *event,
4725              struct perf_sample_data *data,
4726              struct pt_regs *regs)
4727{
4728    return __perf_event_overflow(event, 1, data, regs);
4729}
4730
4731/*
4732 * Generic software event infrastructure
4733 */
4734
4735struct swevent_htable {
4736    struct swevent_hlist *swevent_hlist;
4737    struct mutex hlist_mutex;
4738    int hlist_refcount;
4739
4740    /* Recursion avoidance in each context */
4741    int recursion[PERF_NR_CONTEXTS];
4742};
4743
4744static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
4745
4746/*
4747 * We directly increment event->count and keep a second value in
4748 * event->hw.period_left to count intervals. This period counter
4749 * is kept in the range [-sample_period, 0] so that we can use its
4750 * sign as the trigger.
4751 */
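/*
 * Worked example: with sample_period == 100 and period_left == 30 (we
 * counted 30 events past the sample point), perf_swevent_set_period()
 * reports nr = (100 + 30) / 100 = 1 overflow and resets period_left to
 * 30 - 100 = -70, i.e. 70 more events until the next sample.
 */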
4752
4753static u64 perf_swevent_set_period(struct perf_event *event)
4754{
4755    struct hw_perf_event *hwc = &event->hw;
4756    u64 period = hwc->last_period;
4757    u64 nr, offset;
4758    s64 old, val;
4759
4760    hwc->last_period = hwc->sample_period;
4761
4762again:
4763    old = val = local64_read(&hwc->period_left);
4764    if (val < 0)
4765        return 0;
4766
4767    nr = div64_u64(period + val, period);
4768    offset = nr * period;
4769    val -= offset;
4770    if (local64_cmpxchg(&hwc->period_left, old, val) != old)
4771        goto again;
4772
4773    return nr;
4774}
4775
4776static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4777                    struct perf_sample_data *data,
4778                    struct pt_regs *regs)
4779{
4780    struct hw_perf_event *hwc = &event->hw;
4781    int throttle = 0;
4782
4783    if (!overflow)
4784        overflow = perf_swevent_set_period(event);
4785
4786    if (hwc->interrupts == MAX_INTERRUPTS)
4787        return;
4788
4789    for (; overflow; overflow--) {
4790        if (__perf_event_overflow(event, throttle,
4791                        data, regs)) {
4792            /*
4793             * We inhibit the overflow from happening when
4794             * hwc->interrupts == MAX_INTERRUPTS.
4795             */
4796            break;
4797        }
4798        throttle = 1;
4799    }
4800}
4801
4802static void perf_swevent_event(struct perf_event *event, u64 nr,
4803                   struct perf_sample_data *data,
4804                   struct pt_regs *regs)
4805{
4806    struct hw_perf_event *hwc = &event->hw;
4807
4808    local64_add(nr, &event->count);
4809
4810    if (!regs)
4811        return;
4812
4813    if (!is_sampling_event(event))
4814        return;
4815
4816    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4817        data->period = nr;
4818        return perf_swevent_overflow(event, 1, data, regs);
4819    } else
4820        data->period = event->hw.last_period;
4821
4822    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4823        return perf_swevent_overflow(event, 1, data, regs);
4824
4825    if (local64_add_negative(nr, &hwc->period_left))
4826        return;
4827
4828    perf_swevent_overflow(event, 0, data, regs);
4829}
4830
4831static int perf_exclude_event(struct perf_event *event,
4832                  struct pt_regs *regs)
4833{
4834    if (event->hw.state & PERF_HES_STOPPED)
4835        return 1;
4836
4837    if (regs) {
4838        if (event->attr.exclude_user && user_mode(regs))
4839            return 1;
4840
4841        if (event->attr.exclude_kernel && !user_mode(regs))
4842            return 1;
4843    }
4844
4845    return 0;
4846}
4847
4848static int perf_swevent_match(struct perf_event *event,
4849                enum perf_type_id type,
4850                u32 event_id,
4851                struct perf_sample_data *data,
4852                struct pt_regs *regs)
4853{
4854    if (event->attr.type != type)
4855        return 0;
4856
4857    if (event->attr.config != event_id)
4858        return 0;
4859
4860    if (perf_exclude_event(event, regs))
4861        return 0;
4862
4863    return 1;
4864}
4865
4866static inline u64 swevent_hash(u64 type, u32 event_id)
4867{
4868    u64 val = event_id | (type << 32);
4869
4870    return hash_64(val, SWEVENT_HLIST_BITS);
4871}
4872
4873static inline struct hlist_head *
4874__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4875{
4876    u64 hash = swevent_hash(type, event_id);
4877
4878    return &hlist->heads[hash];
4879}
4880
4881/* For the read side: events when they trigger */
4882static inline struct hlist_head *
4883find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
4884{
4885    struct swevent_hlist *hlist;
4886
4887    hlist = rcu_dereference(swhash->swevent_hlist);
4888    if (!hlist)
4889        return NULL;
4890
4891    return __find_swevent_head(hlist, type, event_id);
4892}
4893
4894/* For the event head insertion and removal in the hlist */
4895static inline struct hlist_head *
4896find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
4897{
4898    struct swevent_hlist *hlist;
4899    u32 event_id = event->attr.config;
4900    u64 type = event->attr.type;
4901
4902    /*
4903     * Event scheduling is always serialized against hlist allocation
4904     * and release, which makes the protected version suitable here;
4905     * the context lock guarantees that.
4906     */
4907    hlist = rcu_dereference_protected(swhash->swevent_hlist,
4908                      lockdep_is_held(&event->ctx->lock));
4909    if (!hlist)
4910        return NULL;
4911
4912    return __find_swevent_head(hlist, type, event_id);
4913}
4914
4915static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4916                    u64 nr,
4917                    struct perf_sample_data *data,
4918                    struct pt_regs *regs)
4919{
4920    struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4921    struct perf_event *event;
4922    struct hlist_node *node;
4923    struct hlist_head *head;
4924
4925    rcu_read_lock();
4926    head = find_swevent_head_rcu(swhash, type, event_id);
4927    if (!head)
4928        goto end;
4929
4930    hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4931        if (perf_swevent_match(event, type, event_id, data, regs))
4932            perf_swevent_event(event, nr, data, regs);
4933    }
4934end:
4935    rcu_read_unlock();
4936}
4937
4938int perf_swevent_get_recursion_context(void)
4939{
4940    struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4941
4942    return get_recursion_context(swhash->recursion);
4943}
4944EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4945
4946inline void perf_swevent_put_recursion_context(int rctx)
4947{
4948    struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4949
4950    put_recursion_context(swhash->recursion, rctx);
4951}
4952
4953void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
4954{
4955    struct perf_sample_data data;
4956    int rctx;
4957
4958    preempt_disable_notrace();
4959    rctx = perf_swevent_get_recursion_context();
4960    if (rctx < 0)
4961        return;
4962
4963    perf_sample_data_init(&data, addr, 0);
4964
4965    do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
4966
4967    perf_swevent_put_recursion_context(rctx);
4968    preempt_enable_notrace();
4969}
4970
4971static void perf_swevent_read(struct perf_event *event)
4972{
4973}
4974
4975static int perf_swevent_add(struct perf_event *event, int flags)
4976{
4977    struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4978    struct hw_perf_event *hwc = &event->hw;
4979    struct hlist_head *head;
4980
4981    if (is_sampling_event(event)) {
4982        hwc->last_period = hwc->sample_period;
4983        perf_swevent_set_period(event);
4984    }
4985
4986    hwc->state = !(flags & PERF_EF_START);
4987
4988    head = find_swevent_head(swhash, event);
4989    if (WARN_ON_ONCE(!head))
4990        return -EINVAL;
4991
4992    hlist_add_head_rcu(&event->hlist_entry, head);
4993
4994    return 0;
4995}
4996
4997static void perf_swevent_del(struct perf_event *event, int flags)
4998{
4999    hlist_del_rcu(&event->hlist_entry);
5000}
5001
5002static void perf_swevent_start(struct perf_event *event, int flags)
5003{
5004    event->hw.state = 0;
5005}
5006
5007static void perf_swevent_stop(struct perf_event *event, int flags)
5008{
5009    event->hw.state = PERF_HES_STOPPED;
5010}
5011
5012/* Deref the hlist from the update side */
5013static inline struct swevent_hlist *
5014swevent_hlist_deref(struct swevent_htable *swhash)
5015{
5016    return rcu_dereference_protected(swhash->swevent_hlist,
5017                     lockdep_is_held(&swhash->hlist_mutex));
5018}
5019
5020static void swevent_hlist_release(struct swevent_htable *swhash)
5021{
5022    struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5023
5024    if (!hlist)
5025        return;
5026
5027    rcu_assign_pointer(swhash->swevent_hlist, NULL);
5028    kfree_rcu(hlist, rcu_head);
5029}
5030
5031static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5032{
5033    struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5034
5035    mutex_lock(&swhash->hlist_mutex);
5036
5037    if (!--swhash->hlist_refcount)
5038        swevent_hlist_release(swhash);
5039
5040    mutex_unlock(&swhash->hlist_mutex);
5041}
5042
5043static void swevent_hlist_put(struct perf_event *event)
5044{
5045    int cpu;
5046
5047    if (event->cpu != -1) {
5048        swevent_hlist_put_cpu(event, event->cpu);
5049        return;
5050    }
5051
5052    for_each_possible_cpu(cpu)
5053        swevent_hlist_put_cpu(event, cpu);
5054}
5055
5056static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5057{
5058    struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5059    int err = 0;
5060
5061    mutex_lock(&swhash->hlist_mutex);
5062
5063    if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5064        struct swevent_hlist *hlist;
5065
5066        hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5067        if (!hlist) {
5068            err = -ENOMEM;
5069            goto exit;
5070        }
5071        rcu_assign_pointer(swhash->swevent_hlist, hlist);
5072    }
5073    swhash->hlist_refcount++;
5074exit:
5075    mutex_unlock(&swhash->hlist_mutex);
5076
5077    return err;
5078}
5079
5080static int swevent_hlist_get(struct perf_event *event)
5081{
5082    int err;
5083    int cpu, failed_cpu;
5084
5085    if (event->cpu != -1)
5086        return swevent_hlist_get_cpu(event, event->cpu);
5087
5088    get_online_cpus();
5089    for_each_possible_cpu(cpu) {
5090        err = swevent_hlist_get_cpu(event, cpu);
5091        if (err) {
5092            failed_cpu = cpu;
5093            goto fail;
5094        }
5095    }
5096    put_online_cpus();
5097
5098    return 0;
5099fail:
5100    for_each_possible_cpu(cpu) {
5101        if (cpu == failed_cpu)
5102            break;
5103        swevent_hlist_put_cpu(event, cpu);
5104    }
5105
5106    put_online_cpus();
5107    return err;
5108}
5109
5110struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5111
5112static void sw_perf_event_destroy(struct perf_event *event)
5113{
5114    u64 event_id = event->attr.config;
5115
5116    WARN_ON(event->parent);
5117
5118    static_key_slow_dec(&perf_swevent_enabled[event_id]);
5119    swevent_hlist_put(event);
5120}
5121
5122static int perf_swevent_init(struct perf_event *event)
5123{
5124    int event_id = event->attr.config;
5125
5126    if (event->attr.type != PERF_TYPE_SOFTWARE)
5127        return -ENOENT;
5128
5129    /*
5130     * no branch sampling for software events
5131     */
5132    if (has_branch_stack(event))
5133        return -EOPNOTSUPP;
5134
5135    switch (event_id) {
5136    case PERF_COUNT_SW_CPU_CLOCK:
5137    case PERF_COUNT_SW_TASK_CLOCK:
5138        return -ENOENT;
5139
5140    default:
5141        break;
5142    }
5143
5144    if (event_id >= PERF_COUNT_SW_MAX)
5145        return -ENOENT;
5146
5147    if (!event->parent) {
5148        int err;
5149
5150        err = swevent_hlist_get(event);
5151        if (err)
5152            return err;
5153
5154        static_key_slow_inc(&perf_swevent_enabled[event_id]);
5155        event->destroy = sw_perf_event_destroy;
5156    }
5157
5158    return 0;
5159}
5160
5161static int perf_swevent_event_idx(struct perf_event *event)
5162{
5163    return 0;
5164}
5165
5166static struct pmu perf_swevent = {
5167    .task_ctx_nr = perf_sw_context,
5168
5169    .event_init = perf_swevent_init,
5170    .add = perf_swevent_add,
5171    .del = perf_swevent_del,
5172    .start = perf_swevent_start,
5173    .stop = perf_swevent_stop,
5174    .read = perf_swevent_read,
5175
5176    .event_idx = perf_swevent_event_idx,
5177};
5178
5179#ifdef CONFIG_EVENT_TRACING
5180
5181static int perf_tp_filter_match(struct perf_event *event,
5182                struct perf_sample_data *data)
5183{
5184    void *record = data->raw->data;
5185
5186    if (likely(!event->filter) || filter_match_preds(event->filter, record))
5187        return 1;
5188    return 0;
5189}
5190
5191static int perf_tp_event_match(struct perf_event *event,
5192                struct perf_sample_data *data,
5193                struct pt_regs *regs)
5194{
5195    if (event->hw.state & PERF_HES_STOPPED)
5196        return 0;
5197    /*
5198     * All tracepoints are from kernel-space.
5199     */
5200    if (event->attr.exclude_kernel)
5201        return 0;
5202
5203    if (!perf_tp_filter_match(event, data))
5204        return 0;
5205
5206    return 1;
5207}
5208
5209void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5210           struct pt_regs *regs, struct hlist_head *head, int rctx)
5211{
5212    struct perf_sample_data data;
5213    struct perf_event *event;
5214    struct hlist_node *node;
5215
5216    struct perf_raw_record raw = {
5217        .size = entry_size,
5218        .data = record,
5219    };
5220
5221    perf_sample_data_init(&data, addr, 0);
5222    data.raw = &raw;
5223
5224    hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5225        if (perf_tp_event_match(event, &data, regs))
5226            perf_swevent_event(event, count, &data, regs);
5227    }
5228
5229    perf_swevent_put_recursion_context(rctx);
5230}
5231EXPORT_SYMBOL_GPL(perf_tp_event);
5232
5233static void tp_perf_event_destroy(struct perf_event *event)
5234{
5235    perf_trace_destroy(event);
5236}
5237
5238static int perf_tp_event_init(struct perf_event *event)
5239{
5240    int err;
5241
5242    if (event->attr.type != PERF_TYPE_TRACEPOINT)
5243        return -ENOENT;
5244
5245    /*
5246     * no branch sampling for tracepoint events
5247     */
5248    if (has_branch_stack(event))
5249        return -EOPNOTSUPP;
5250
5251    err = perf_trace_init(event);
5252    if (err)
5253        return err;
5254
5255    event->destroy = tp_perf_event_destroy;
5256
5257    return 0;
5258}
5259
5260static struct pmu perf_tracepoint = {
5261    .task_ctx_nr = perf_sw_context,
5262
5263    .event_init = perf_tp_event_init,
5264    .add = perf_trace_add,
5265    .del = perf_trace_del,
5266    .start = perf_swevent_start,
5267    .stop = perf_swevent_stop,
5268    .read = perf_swevent_read,
5269
5270    .event_idx = perf_swevent_event_idx,
5271};
5272
5273static inline void perf_tp_register(void)
5274{
5275    perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5276}
5277
5278static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5279{
5280    char *filter_str;
5281    int ret;
5282
5283    if (event->attr.type != PERF_TYPE_TRACEPOINT)
5284        return -EINVAL;
5285
5286    filter_str = strndup_user(arg, PAGE_SIZE);
5287    if (IS_ERR(filter_str))
5288        return PTR_ERR(filter_str);
5289
5290    ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5291
5292    kfree(filter_str);
5293    return ret;
5294}
5295
5296static void perf_event_free_filter(struct perf_event *event)
5297{
5298    ftrace_profile_free_filter(event);
5299}
5300
5301#else
5302
5303static inline void perf_tp_register(void)
5304{
5305}
5306
5307static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5308{
5309    return -ENOENT;
5310}
5311
5312static void perf_event_free_filter(struct perf_event *event)
5313{
5314}
5315
5316#endif /* CONFIG_EVENT_TRACING */
5317
5318#ifdef CONFIG_HAVE_HW_BREAKPOINT
5319void perf_bp_event(struct perf_event *bp, void *data)
5320{
5321    struct perf_sample_data sample;
5322    struct pt_regs *regs = data;
5323
5324    perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
5325
5326    if (!bp->hw.state && !perf_exclude_event(bp, regs))
5327        perf_swevent_event(bp, 1, &sample, regs);
5328}
5329#endif
5330
5331/*
5332 * hrtimer based swevent callback
5333 */
5334
5335static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5336{
5337    enum hrtimer_restart ret = HRTIMER_RESTART;
5338    struct perf_sample_data data;
5339    struct pt_regs *regs;
5340    struct perf_event *event;
5341    u64 period;
5342
5343    event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5344
5345    if (event->state != PERF_EVENT_STATE_ACTIVE)
5346        return HRTIMER_NORESTART;
5347
5348    event->pmu->read(event);
5349
5350    perf_sample_data_init(&data, 0, event->hw.last_period);
5351    regs = get_irq_regs();
5352
5353    if (regs && !perf_exclude_event(event, regs)) {
5354        if (!(event->attr.exclude_idle && is_idle_task(current)))
5355            if (__perf_event_overflow(event, 1, &data, regs))
5356                ret = HRTIMER_NORESTART;
5357    }
5358
5359    period = max_t(u64, 10000, event->hw.sample_period);
5360    hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5361
5362    return ret;
5363}
5364
5365static void perf_swevent_start_hrtimer(struct perf_event *event)
5366{
5367    struct hw_perf_event *hwc = &event->hw;
5368    s64 period;
5369
5370    if (!is_sampling_event(event))
5371        return;
5372
5373    period = local64_read(&hwc->period_left);
5374    if (period) {
5375        if (period < 0)
5376            period = 10000;
5377
5378        local64_set(&hwc->period_left, 0);
5379    } else {
5380        period = max_t(u64, 10000, hwc->sample_period);
5381    }
5382    __hrtimer_start_range_ns(&hwc->hrtimer,
5383                ns_to_ktime(period), 0,
5384                HRTIMER_MODE_REL_PINNED, 0);
5385}
5386
5387static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5388{
5389    struct hw_perf_event *hwc = &event->hw;
5390
5391    if (is_sampling_event(event)) {
5392        ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5393        local64_set(&hwc->period_left, ktime_to_ns(remaining));
5394
5395        hrtimer_cancel(&hwc->hrtimer);
5396    }
5397}
5398
5399static void perf_swevent_init_hrtimer(struct perf_event *event)
5400{
5401    struct hw_perf_event *hwc = &event->hw;
5402
5403    if (!is_sampling_event(event))
5404        return;
5405
5406    hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5407    hwc->hrtimer.function = perf_swevent_hrtimer;
5408
5409    /*
5410     * Since hrtimers have a fixed rate, we can do a static freq->period
5411     * mapping and avoid the whole period adjust feedback stuff.
5412     */
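    /*
     * e.g. attr.sample_freq == 4000 yields a fixed period of
     * NSEC_PER_SEC / 4000 == 250000 ns between samples.
     */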
5413    if (event->attr.freq) {
5414        long freq = event->attr.sample_freq;
5415
5416        event->attr.sample_period = NSEC_PER_SEC / freq;
5417        hwc->sample_period = event->attr.sample_period;
5418        local64_set(&hwc->period_left, hwc->sample_period);
5419        event->attr.freq = 0;
5420    }
5421}
5422
5423/*
5424 * Software event: cpu wall time clock
5425 */
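/*
 * The count is a running sum of local_clock() deltas: hw.prev_count
 * holds the timestamp taken at start, and cpu_clock_event_update()
 * folds (now - prev) into event->count on stop and read.
 */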
5426
5427static void cpu_clock_event_update(struct perf_event *event)
5428{
5429    s64 prev;
5430    u64 now;
5431
5432    now = local_clock();
5433    prev = local64_xchg(&event->hw.prev_count, now);
5434    local64_add(now - prev, &event->count);
5435}
5436
5437static void cpu_clock_event_start(struct perf_event *event, int flags)
5438{
5439    local64_set(&event->hw.prev_count, local_clock());
5440    perf_swevent_start_hrtimer(event);
5441}
5442
5443static void cpu_clock_event_stop(struct perf_event *event, int flags)
5444{
5445    perf_swevent_cancel_hrtimer(event);
5446    cpu_clock_event_update(event);
5447}
5448
5449static int cpu_clock_event_add(struct perf_event *event, int flags)
5450{
5451    if (flags & PERF_EF_START)
5452        cpu_clock_event_start(event, flags);
5453
5454    return 0;
5455}
5456
5457static void cpu_clock_event_del(struct perf_event *event, int flags)
5458{
5459    cpu_clock_event_stop(event, flags);
5460}
5461
5462static void cpu_clock_event_read(struct perf_event *event)
5463{
5464    cpu_clock_event_update(event);
5465}
5466
5467static int cpu_clock_event_init(struct perf_event *event)
5468{
5469    if (event->attr.type != PERF_TYPE_SOFTWARE)
5470        return -ENOENT;
5471
5472    if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5473        return -ENOENT;
5474
5475    /*
5476     * no branch sampling for software events
5477     */
5478    if (has_branch_stack(event))
5479        return -EOPNOTSUPP;
5480
5481    perf_swevent_init_hrtimer(event);
5482
5483    return 0;
5484}
5485
5486static struct pmu perf_cpu_clock = {
5487    .task_ctx_nr = perf_sw_context,
5488
5489    .event_init = cpu_clock_event_init,
5490    .add = cpu_clock_event_add,
5491    .del = cpu_clock_event_del,
5492    .start = cpu_clock_event_start,
5493    .stop = cpu_clock_event_stop,
5494    .read = cpu_clock_event_read,
5495
5496    .event_idx = perf_swevent_event_idx,
5497};
5498
5499/*
5500 * Software event: task time clock
5501 */
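/*
 * Unlike the cpu clock, this one advances with the context's time
 * (event->ctx->time), i.e. only while the task is scheduled in;
 * task_clock_event_read() extrapolates by the delta since the last
 * ctx->timestamp update.
 */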
5502
5503static void task_clock_event_update(struct perf_event *event, u64 now)
5504{
5505    u64 prev;
5506    s64 delta;
5507
5508    prev = local64_xchg(&event->hw.prev_count, now);
5509    delta = now - prev;
5510    local64_add(delta, &event->count);
5511}
5512
5513static void task_clock_event_start(struct perf_event *event, int flags)
5514{
5515    local64_set(&event->hw.prev_count, event->ctx->time);
5516    perf_swevent_start_hrtimer(event);
5517}
5518
5519static void task_clock_event_stop(struct perf_event *event, int flags)
5520{
5521    perf_swevent_cancel_hrtimer(event);
5522    task_clock_event_update(event, event->ctx->time);
5523}
5524
5525static int task_clock_event_add(struct perf_event *event, int flags)
5526{
5527    if (flags & PERF_EF_START)
5528        task_clock_event_start(event, flags);
5529
5530    return 0;
5531}
5532
5533static void task_clock_event_del(struct perf_event *event, int flags)
5534{
5535    task_clock_event_stop(event, PERF_EF_UPDATE);
5536}
5537
5538static void task_clock_event_read(struct perf_event *event)
5539{
5540    u64 now = perf_clock();
5541    u64 delta = now - event->ctx->timestamp;
5542    u64 time = event->ctx->time + delta;
5543
5544    task_clock_event_update(event, time);
5545}
5546
5547static int task_clock_event_init(struct perf_event *event)
5548{
5549    if (event->attr.type != PERF_TYPE_SOFTWARE)
5550        return -ENOENT;
5551
5552    if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5553        return -ENOENT;
5554
5555    /*
5556     * no branch sampling for software events
5557     */
5558    if (has_branch_stack(event))
5559        return -EOPNOTSUPP;
5560
5561    perf_swevent_init_hrtimer(event);
5562
5563    return 0;
5564}
5565
5566static struct pmu perf_task_clock = {
5567    .task_ctx_nr = perf_sw_context,
5568
5569    .event_init = task_clock_event_init,
5570    .add = task_clock_event_add,
5571    .del = task_clock_event_del,
5572    .start = task_clock_event_start,
5573    .stop = task_clock_event_stop,
5574    .read = task_clock_event_read,
5575
5576    .event_idx = perf_swevent_event_idx,
5577};
5578
5579static void perf_pmu_nop_void(struct pmu *pmu)
5580{
5581}
5582
5583static int perf_pmu_nop_int(struct pmu *pmu)
5584{
5585    return 0;
5586}
5587
5588static void perf_pmu_start_txn(struct pmu *pmu)
5589{
5590    perf_pmu_disable(pmu);
5591}
5592
5593static int perf_pmu_commit_txn(struct pmu *pmu)
5594{
5595    perf_pmu_enable(pmu);
5596    return 0;
5597}
5598
5599static void perf_pmu_cancel_txn(struct pmu *pmu)
5600{
5601    perf_pmu_enable(pmu);
5602}
5603
5604static int perf_event_idx_default(struct perf_event *event)
5605{
5606    return event->hw.idx + 1;
5607}
5608
5609/*
5610 * Ensures all contexts with the same task_ctx_nr have the same
5611 * pmu_cpu_context too.
5612 */
5613static void *find_pmu_context(int ctxn)
5614{
5615    struct pmu *pmu;
5616
5617    if (ctxn < 0)
5618        return NULL;
5619
5620    list_for_each_entry(pmu, &pmus, entry) {
5621        if (pmu->task_ctx_nr == ctxn)
5622            return pmu->pmu_cpu_context;
5623    }
5624
5625    return NULL;
5626}
5627
5628static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5629{
5630    int cpu;
5631
5632    for_each_possible_cpu(cpu) {
5633        struct perf_cpu_context *cpuctx;
5634
5635        cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5636
5637        if (cpuctx->active_pmu == old_pmu)
5638            cpuctx->active_pmu = pmu;
5639    }
5640}
5641
5642static void free_pmu_context(struct pmu *pmu)
5643{
5644    struct pmu *i;
5645
5646    mutex_lock(&pmus_lock);
5647    /*
5648     * Like a real lame refcount.
5649     */
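    /*
     * The percpu context may be shared by every pmu with the same
     * task_ctx_nr (see find_pmu_context()); only free it once no other
     * pmu still points at it, otherwise just migrate the active_pmu
     * role to one of the remaining users.
     */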
5650    list_for_each_entry(i, &pmus, entry) {
5651        if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5652            update_pmu_context(i, pmu);
5653            goto out;
5654        }
5655    }
5656
5657    free_percpu(pmu->pmu_cpu_context);
5658out:
5659    mutex_unlock(&pmus_lock);
5660}
5661static struct idr pmu_idr;
5662
5663static ssize_t
5664type_show(struct device *dev, struct device_attribute *attr, char *page)
5665{
5666    struct pmu *pmu = dev_get_drvdata(dev);
5667
5668    return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5669}
5670
5671static struct device_attribute pmu_dev_attrs[] = {
5672       __ATTR_RO(type),
5673       __ATTR_NULL,
5674};
5675
5676static int pmu_bus_running;
5677static struct bus_type pmu_bus = {
5678    .name = "event_source",
5679    .dev_attrs = pmu_dev_attrs,
5680};
5681
5682static void pmu_dev_release(struct device *dev)
5683{
5684    kfree(dev);
5685}
5686
5687static int pmu_dev_alloc(struct pmu *pmu)
5688{
5689    int ret = -ENOMEM;
5690
5691    pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5692    if (!pmu->dev)
5693        goto out;
5694
5695    pmu->dev->groups = pmu->attr_groups;
5696    device_initialize(pmu->dev);
5697    ret = dev_set_name(pmu->dev, "%s", pmu->name);
5698    if (ret)
5699        goto free_dev;
5700
5701    dev_set_drvdata(pmu->dev, pmu);
5702    pmu->dev->bus = &pmu_bus;
5703    pmu->dev->release = pmu_dev_release;
5704    ret = device_add(pmu->dev);
5705    if (ret)
5706        goto free_dev;
5707
5708out:
5709    return ret;
5710
5711free_dev:
5712    put_device(pmu->dev);
5713    goto out;
5714}
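
/*
 * The device registered here appears on the "event_source" bus, e.g.
 * as /sys/bus/event_source/devices/<pmu->name>/, and its "type"
 * attribute (type_show() above) exports the PMU's type number so user
 * space can pass it back in perf_event_attr.type.
 */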
5715
5716static struct lock_class_key cpuctx_mutex;
5717static struct lock_class_key cpuctx_lock;
5718
5719int perf_pmu_register(struct pmu *pmu, char *name, int type)
5720{
5721    int cpu, ret;
5722
5723    mutex_lock(&pmus_lock);
5724    ret = -ENOMEM;
5725    pmu->pmu_disable_count = alloc_percpu(int);
5726    if (!pmu->pmu_disable_count)
5727        goto unlock;
5728
5729    pmu->type = -1;
5730    if (!name)
5731        goto skip_type;
5732    pmu->name = name;
5733
5734    if (type < 0) {
5735        int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5736        if (!err)
5737            goto free_pdc;
5738
5739        err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5740        if (err) {
5741            ret = err;
5742            goto free_pdc;
5743        }
5744    }
5745    pmu->type = type;
5746
5747    if (pmu_bus_running) {
5748        ret = pmu_dev_alloc(pmu);
5749        if (ret)
5750            goto free_idr;
5751    }
5752
5753skip_type:
5754    pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5755    if (pmu->pmu_cpu_context)
5756        goto got_cpu_context;
5757
5758    pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5759    if (!pmu->pmu_cpu_context)
5760        goto free_dev;
5761
5762    for_each_possible_cpu(cpu) {
5763        struct perf_cpu_context *cpuctx;
5764
5765        cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5766        __perf_event_init_context(&cpuctx->ctx);
5767        lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5768        lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
5769        cpuctx->ctx.type = cpu_context;
5770        cpuctx->ctx.pmu = pmu;
5771        cpuctx->jiffies_interval = 1;
5772        INIT_LIST_HEAD(&cpuctx->rotation_list);
5773        cpuctx->active_pmu = pmu;
5774    }
5775
5776got_cpu_context:
5777    if (!pmu->start_txn) {
5778        if (pmu->pmu_enable) {
5779            /*
5780             * If we have pmu_enable/pmu_disable calls, install
5781             * transaction stubs that use them to batch the
5782             * hardware accesses.
5783             */
5784            pmu->start_txn = perf_pmu_start_txn;
5785            pmu->commit_txn = perf_pmu_commit_txn;
5786            pmu->cancel_txn = perf_pmu_cancel_txn;
5787        } else {
5788            pmu->start_txn = perf_pmu_nop_void;
5789            pmu->commit_txn = perf_pmu_nop_int;
5790            pmu->cancel_txn = perf_pmu_nop_void;
5791        }
5792    }
5793
5794    if (!pmu->pmu_enable) {
5795        pmu->pmu_enable = perf_pmu_nop_void;
5796        pmu->pmu_disable = perf_pmu_nop_void;
5797    }
5798
5799    if (!pmu->event_idx)
5800        pmu->event_idx = perf_event_idx_default;
5801
5802    list_add_rcu(&pmu->entry, &pmus);
5803    ret = 0;
5804unlock:
5805    mutex_unlock(&pmus_lock);
5806
5807    return ret;
5808
5809free_dev:
5810    device_del(pmu->dev);
5811    put_device(pmu->dev);
5812
5813free_idr:
5814    if (pmu->type >= PERF_TYPE_MAX)
5815        idr_remove(&pmu_idr, pmu->type);
5816
5817free_pdc:
5818    free_percpu(pmu->pmu_disable_count);
5819    goto unlock;
5820}
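
/*
 * Illustrative sketch (hypothetical driver, not part of this file): a
 * minimal PMU only has to supply event_init/add/del/start/stop/read;
 * the transaction, enable/disable and event_idx callbacks are filled
 * in with the defaults above.  Passing type == -1 allocates a dynamic
 * type number through pmu_idr:
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_event_init,
 *		.add		= my_event_add,
 *		.del		= my_event_del,
 *		.start		= my_event_start,
 *		.stop		= my_event_stop,
 *		.read		= my_event_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1);
 */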
5821
5822void perf_pmu_unregister(struct pmu *pmu)
5823{
5824    mutex_lock(&pmus_lock);
5825    list_del_rcu(&pmu->entry);
5826    mutex_unlock(&pmus_lock);
5827
5828    /*
5829     * We dereference the pmu list under both SRCU and regular RCU, so
5830     * synchronize against both of those.
5831     */
5832    synchronize_srcu(&pmus_srcu);
5833    synchronize_rcu();
5834
5835    free_percpu(pmu->pmu_disable_count);
5836    if (pmu->type >= PERF_TYPE_MAX)
5837        idr_remove(&pmu_idr, pmu->type);
5838    device_del(pmu->dev);
5839    put_device(pmu->dev);
5840    free_pmu_context(pmu);
5841}
5842
5843struct pmu *perf_init_event(struct perf_event *event)
5844{
5845    struct pmu *pmu = NULL;
5846    int idx;
5847    int ret;
5848
5849    idx = srcu_read_lock(&pmus_srcu);
5850
5851    rcu_read_lock();
5852    pmu = idr_find(&pmu_idr, event->attr.type);
5853    rcu_read_unlock();
5854    if (pmu) {
5855        event->pmu = pmu;
5856        ret = pmu->event_init(event);
5857        if (ret)
5858            pmu = ERR_PTR(ret);
5859        goto unlock;
5860    }
5861
5862    list_for_each_entry_rcu(pmu, &pmus, entry) {
5863        event->pmu = pmu;
5864        ret = pmu->event_init(event);
5865        if (!ret)
5866            goto unlock;
5867
5868        if (ret != -ENOENT) {
5869            pmu = ERR_PTR(ret);
5870            goto unlock;
5871        }
5872    }
5873    pmu = ERR_PTR(-ENOENT);
5874unlock:
5875    srcu_read_unlock(&pmus_srcu, idx);
5876
5877    return pmu;
5878}
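
/*
 * Lookup order: dynamically allocated types (those handed out through
 * pmu_idr at registration time) are resolved directly by idr_find();
 * the built-in PERF_TYPE_* values fall through to the list walk, where
 * -ENOENT from ->event_init() means "not mine, try the next PMU" and
 * any other error aborts the search.
 */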
5879
5880/*
5881 * Allocate and initialize an event structure
5882 */
5883static struct perf_event *
5884perf_event_alloc(struct perf_event_attr *attr, int cpu,
5885         struct task_struct *task,
5886         struct perf_event *group_leader,
5887         struct perf_event *parent_event,
5888         perf_overflow_handler_t overflow_handler,
5889         void *context)
5890{
5891    struct pmu *pmu;
5892    struct perf_event *event;
5893    struct hw_perf_event *hwc;
5894    long err;
5895
5896    if ((unsigned)cpu >= nr_cpu_ids) {
5897        if (!task || cpu != -1)
5898            return ERR_PTR(-EINVAL);
5899    }
5900
5901    event = kzalloc(sizeof(*event), GFP_KERNEL);
5902    if (!event)
5903        return ERR_PTR(-ENOMEM);
5904
5905    /*
5906     * Single events are their own group leaders, with an
5907     * empty sibling list:
5908     */
5909    if (!group_leader)
5910        group_leader = event;
5911
5912    mutex_init(&event->child_mutex);
5913    INIT_LIST_HEAD(&event->child_list);
5914
5915    INIT_LIST_HEAD(&event->group_entry);
5916    INIT_LIST_HEAD(&event->event_entry);
5917    INIT_LIST_HEAD(&event->sibling_list);
5918    INIT_LIST_HEAD(&event->rb_entry);
5919
5920    init_waitqueue_head(&event->waitq);
5921    init_irq_work(&event->pending, perf_pending_event);
5922
5923    mutex_init(&event->mmap_mutex);
5924
5925    event->cpu = cpu;
5926    event->attr = *attr;
5927    event->group_leader = group_leader;
5928    event->pmu = NULL;
5929    event->oncpu = -1;
5930
5931    event->parent = parent_event;
5932
5933    event->ns = get_pid_ns(current->nsproxy->pid_ns);
5934    event->id = atomic64_inc_return(&perf_event_id);
5935
5936    event->state = PERF_EVENT_STATE_INACTIVE;
5937
5938    if (task) {
5939        event->attach_state = PERF_ATTACH_TASK;
5940#ifdef CONFIG_HAVE_HW_BREAKPOINT
5941        /*
5942         * hw_breakpoint is a bit special: record the target task for it.
5943         */
5944        if (attr->type == PERF_TYPE_BREAKPOINT)
5945            event->hw.bp_target = task;
5946#endif
5947    }
5948
5949    if (!overflow_handler && parent_event) {
5950        overflow_handler = parent_event->overflow_handler;
5951        context = parent_event->overflow_handler_context;
5952    }
5953
5954    event->overflow_handler = overflow_handler;
5955    event->overflow_handler_context = context;
5956
5957    if (attr->disabled)
5958        event->state = PERF_EVENT_STATE_OFF;
5959
5960    pmu = NULL;
5961
5962    hwc = &event->hw;
5963    hwc->sample_period = attr->sample_period;
5964    if (attr->freq && attr->sample_freq)
5965        hwc->sample_period = 1;
5966    hwc->last_period = hwc->sample_period;
5967
5968    local64_set(&hwc->period_left, hwc->sample_period);
5969
5970    /*
5971     * we currently do not support PERF_FORMAT_GROUP on inherited events
5972     */
5973    if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
5974        goto done;
5975
5976    pmu = perf_init_event(event);
5977
5978done:
5979    err = 0;
5980    if (!pmu)
5981        err = -EINVAL;
5982    else if (IS_ERR(pmu))
5983        err = PTR_ERR(pmu);
5984
5985    if (err) {
5986        if (event->ns)
5987            put_pid_ns(event->ns);
5988        kfree(event);
5989        return ERR_PTR(err);
5990    }
5991
5992    if (!event->parent) {
5993        if (event->attach_state & PERF_ATTACH_TASK)
5994            static_key_slow_inc(&perf_sched_events.key);
5995        if (event->attr.mmap || event->attr.mmap_data)
5996            atomic_inc(&nr_mmap_events);
5997        if (event->attr.comm)
5998            atomic_inc(&nr_comm_events);
5999        if (event->attr.task)
6000            atomic_inc(&nr_task_events);
6001        if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6002            err = get_callchain_buffers();
6003            if (err) {
6004                free_event(event);
6005                return ERR_PTR(err);
6006            }
6007        }
6008        if (has_branch_stack(event)) {
6009            static_key_slow_inc(&perf_sched_events.key);
6010            if (!(event->attach_state & PERF_ATTACH_TASK))
6011                atomic_inc(&per_cpu(perf_branch_stack_events,
6012                            event->cpu));
6013        }
6014    }
6015
6016    return event;
6017}
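
/*
 * Note that perf_event_alloc() never returns NULL: callers get either
 * a valid event or an ERR_PTR()-encoded errno and must test the result
 * with IS_ERR().  The static_key and atomic counters bumped above keep
 * the scheduling hooks and the mmap/comm/task side-band output enabled
 * only while at least one interested event exists.
 */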
6018
6019static int perf_copy_attr(struct perf_event_attr __user *uattr,
6020              struct perf_event_attr *attr)
6021{
6022    u32 size;
6023    int ret;
6024
6025    if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6026        return -EFAULT;
6027
6028    /*
6029     * Zero the full structure, so that a short copy leaves the remaining fields zeroed.
6030     */
6031    memset(attr, 0, sizeof(*attr));
6032
6033    ret = get_user(size, &uattr->size);
6034    if (ret)
6035        return ret;
6036
6037    if (size > PAGE_SIZE) /* silly large */
6038        goto err_size;
6039
6040    if (!size) /* abi compat */
6041        size = PERF_ATTR_SIZE_VER0;
6042
6043    if (size < PERF_ATTR_SIZE_VER0)
6044        goto err_size;
6045
6046    /*
6047     * If we're handed a bigger struct than we know of,
6048     * ensure all the unknown bits are 0 - i.e. new
6049     * user-space does not rely on any kernel feature
6050     * extensions we don't know about yet.
6051     */
6052    if (size > sizeof(*attr)) {
6053        unsigned char __user *addr;
6054        unsigned char __user *end;
6055        unsigned char val;
6056
6057        addr = (void __user *)uattr + sizeof(*attr);
6058        end = (void __user *)uattr + size;
6059
6060        for (; addr < end; addr++) {
6061            ret = get_user(val, addr);
6062            if (ret)
6063                return ret;
6064            if (val)
6065                goto err_size;
6066        }
6067        size = sizeof(*attr);
6068    }
6069
6070    ret = copy_from_user(attr, uattr, size);
6071    if (ret)
6072        return -EFAULT;
6073
6074    if (attr->__reserved_1)
6075        return -EINVAL;
6076
6077    if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6078        return -EINVAL;
6079
6080    if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6081        return -EINVAL;
6082
6083    if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6084        u64 mask = attr->branch_sample_type;
6085
6086        /* only using defined bits */
6087        if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6088            return -EINVAL;
6089
6090        /* at least one branch bit must be set */
6091        if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6092            return -EINVAL;
6093
6094        /* kernel level capture: check permissions */
6095        if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6096            && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6097            return -EACCES;
6098
6099        /* propagate priv level, when not set for branch */
6100        if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6101
6102            /* exclude_kernel checked on syscall entry */
6103            if (!attr->exclude_kernel)
6104                mask |= PERF_SAMPLE_BRANCH_KERNEL;
6105
6106            if (!attr->exclude_user)
6107                mask |= PERF_SAMPLE_BRANCH_USER;
6108
6109            if (!attr->exclude_hv)
6110                mask |= PERF_SAMPLE_BRANCH_HV;
6111            /*
6112             * adjust user setting (for HW filter setup)
6113             */
6114            attr->branch_sample_type = mask;
6115        }
6116    }
6117out:
6118    return ret;
6119
6120err_size:
6121    put_user(sizeof(*attr), &uattr->size);
6122    ret = -E2BIG;
6123    goto out;
6124}
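
/*
 * The size handshake above is what keeps the attr ABI extensible.  A
 * user-space caller of perf_event_open() is expected to do roughly
 * (illustrative sketch):
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size   = sizeof(attr);
 *	attr.type   = PERF_TYPE_SOFTWARE;
 *	attr.config = PERF_COUNT_SW_TASK_CLOCK;
 *
 * A shorter attr is implicitly zero-extended by the memset() at the
 * top of perf_copy_attr(); a longer one is accepted only if every byte
 * past sizeof(*attr) is zero, so newer user space cannot silently
 * depend on extensions this kernel does not implement.
 */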
6125
6126static int
6127perf_event_set_output(struct perf_event *event, struct perf_event *output_event)