1 | /* |
2 | * Performance events core code: |
3 | * |
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * |
9 | * For licensing details see kernel-base/COPYING |
10 | */ |
11 | |
12 | #include <linux/fs.h> |
13 | #include <linux/mm.h> |
14 | #include <linux/cpu.h> |
15 | #include <linux/smp.h> |
16 | #include <linux/file.h> |
17 | #include <linux/poll.h> |
18 | #include <linux/slab.h> |
19 | #include <linux/hash.h> |
20 | #include <linux/sysfs.h> |
21 | #include <linux/dcache.h> |
22 | #include <linux/percpu.h> |
23 | #include <linux/ptrace.h> |
24 | #include <linux/vmstat.h> |
25 | #include <linux/vmalloc.h> |
26 | #include <linux/hardirq.h> |
27 | #include <linux/rculist.h> |
28 | #include <linux/uaccess.h> |
29 | #include <linux/syscalls.h> |
30 | #include <linux/anon_inodes.h> |
31 | #include <linux/kernel_stat.h> |
32 | #include <linux/perf_event.h> |
33 | #include <linux/ftrace_event.h> |
34 | #include <linux/hw_breakpoint.h> |
35 | |
36 | #include <asm/irq_regs.h> |
37 | |
38 | /* |
39 | * Each CPU has a list of per CPU events: |
40 | */ |
41 | static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); |
42 | |
43 | int perf_max_events __read_mostly = 1; |
44 | static int perf_reserved_percpu __read_mostly; |
45 | static int perf_overcommit __read_mostly = 1; |
46 | |
47 | static atomic_t nr_events __read_mostly; |
48 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; |
51 | |
52 | /* |
53 | * perf event paranoia level: |
54 | * -1 - not paranoid at all |
55 | * 0 - disallow raw tracepoint access for unpriv |
56 | * 1 - disallow cpu events for unpriv |
57 | * 2 - disallow kernel profiling for unpriv |
58 | */ |
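/*
 * Runtime-tunable; typically exposed as /proc/sys/kernel/perf_event_paranoid,
 * e.g. "echo 2 > /proc/sys/kernel/perf_event_paranoid".
 */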
59 | int sysctl_perf_event_paranoid __read_mostly = 1; |
60 | |
61 | int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ |
62 | |
63 | /* |
64 | * max perf event sample rate |
65 | */ |
66 | int sysctl_perf_event_sample_rate __read_mostly = 100000; |
67 | |
68 | static atomic64_t perf_event_id; |
69 | |
70 | /* |
71 | * Lock for (sysadmin-configurable) event reservations: |
72 | */ |
73 | static DEFINE_SPINLOCK(perf_resource_lock); |
74 | |
75 | /* |
76 | * Architecture provided APIs - weak aliases: |
77 | */ |
78 | extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) |
79 | { |
80 | return NULL; |
81 | } |
82 | |
83 | void __weak hw_perf_disable(void) { barrier(); } |
84 | void __weak hw_perf_enable(void) { barrier(); } |
85 | |
86 | void __weak perf_event_print_debug(void) { } |
87 | |
88 | static DEFINE_PER_CPU(int, perf_disable_count); |
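/*
 * perf_disable()/perf_enable() nest via this per-CPU counter: only the
 * outermost perf_disable() invokes hw_perf_disable(), and only the matching
 * outermost perf_enable() invokes hw_perf_enable() again.
 */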
89 | |
90 | void perf_disable(void) |
91 | { |
92 | if (!__get_cpu_var(perf_disable_count)++) |
93 | hw_perf_disable(); |
94 | } |
95 | |
96 | void perf_enable(void) |
97 | { |
98 | if (!--__get_cpu_var(perf_disable_count)) |
99 | hw_perf_enable(); |
100 | } |
101 | |
102 | static void get_ctx(struct perf_event_context *ctx) |
103 | { |
104 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
105 | } |
106 | |
107 | static void free_ctx(struct rcu_head *head) |
108 | { |
109 | struct perf_event_context *ctx; |
110 | |
111 | ctx = container_of(head, struct perf_event_context, rcu_head); |
112 | kfree(ctx); |
113 | } |
114 | |
115 | static void put_ctx(struct perf_event_context *ctx) |
116 | { |
117 | if (atomic_dec_and_test(&ctx->refcount)) { |
118 | if (ctx->parent_ctx) |
119 | put_ctx(ctx->parent_ctx); |
120 | if (ctx->task) |
121 | put_task_struct(ctx->task); |
122 | call_rcu(&ctx->rcu_head, free_ctx); |
123 | } |
124 | } |
125 | |
126 | static void unclone_ctx(struct perf_event_context *ctx) |
127 | { |
128 | if (ctx->parent_ctx) { |
129 | put_ctx(ctx->parent_ctx); |
130 | ctx->parent_ctx = NULL; |
131 | } |
132 | } |
133 | |
134 | /* |
135 | * If we inherit events we want to return the parent event id |
136 | * to userspace. |
137 | */ |
138 | static u64 primary_event_id(struct perf_event *event) |
139 | { |
140 | u64 id = event->id; |
141 | |
142 | if (event->parent) |
143 | id = event->parent->id; |
144 | |
145 | return id; |
146 | } |
147 | |
148 | /* |
149 | * Get the perf_event_context for a task and lock it. |
150 | * This has to cope with the fact that until it is locked, |
151 | * the context could get moved to another task. |
152 | */ |
153 | static struct perf_event_context * |
154 | perf_lock_task_context(struct task_struct *task, unsigned long *flags) |
155 | { |
156 | struct perf_event_context *ctx; |
157 | |
158 | rcu_read_lock(); |
159 | retry: |
160 | ctx = rcu_dereference(task->perf_event_ctxp); |
161 | if (ctx) { |
162 | /* |
163 | * If this context is a clone of another, it might |
164 | * get swapped for another underneath us by |
165 | * perf_event_task_sched_out, though the |
166 | * rcu_read_lock() protects us from any context |
167 | * getting freed. Lock the context and check if it |
168 | * got swapped before we could get the lock, and retry |
169 | * if so. If we locked the right context, then it |
170 | * can't get swapped on us any more. |
171 | */ |
172 | raw_spin_lock_irqsave(&ctx->lock, *flags); |
173 | if (ctx != rcu_dereference(task->perf_event_ctxp)) { |
174 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
175 | goto retry; |
176 | } |
177 | |
178 | if (!atomic_inc_not_zero(&ctx->refcount)) { |
179 | raw_spin_unlock_irqrestore(&ctx->lock, *flags); |
180 | ctx = NULL; |
181 | } |
182 | } |
183 | rcu_read_unlock(); |
184 | return ctx; |
185 | } |
186 | |
187 | /* |
188 | * Get the context for a task and increment its pin_count so it |
189 | * can't get swapped to another task. This also increments its |
190 | * reference count so that the context can't get freed. |
191 | */ |
192 | static struct perf_event_context *perf_pin_task_context(struct task_struct *task) |
193 | { |
194 | struct perf_event_context *ctx; |
195 | unsigned long flags; |
196 | |
197 | ctx = perf_lock_task_context(task, &flags); |
198 | if (ctx) { |
199 | ++ctx->pin_count; |
200 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
201 | } |
202 | return ctx; |
203 | } |
204 | |
205 | static void perf_unpin_context(struct perf_event_context *ctx) |
206 | { |
207 | unsigned long flags; |
208 | |
209 | raw_spin_lock_irqsave(&ctx->lock, flags); |
210 | --ctx->pin_count; |
211 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
212 | put_ctx(ctx); |
213 | } |
214 | |
215 | static inline u64 perf_clock(void) |
216 | { |
217 | return cpu_clock(raw_smp_processor_id()); |
218 | } |
219 | |
220 | /* |
221 | * Update the record of the current time in a context. |
222 | */ |
223 | static void update_context_time(struct perf_event_context *ctx) |
224 | { |
225 | u64 now = perf_clock(); |
226 | |
227 | ctx->time += now - ctx->timestamp; |
228 | ctx->timestamp = now; |
229 | } |
230 | |
231 | /* |
232 | * Update the total_time_enabled and total_time_running fields for an event. |
233 | */ |
234 | static void update_event_times(struct perf_event *event) |
235 | { |
236 | struct perf_event_context *ctx = event->ctx; |
237 | u64 run_end; |
238 | |
239 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
240 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
241 | return; |
242 | |
243 | if (ctx->is_active) |
244 | run_end = ctx->time; |
245 | else |
246 | run_end = event->tstamp_stopped; |
247 | |
248 | event->total_time_enabled = run_end - event->tstamp_enabled; |
249 | |
250 | if (event->state == PERF_EVENT_STATE_INACTIVE) |
251 | run_end = event->tstamp_stopped; |
252 | else |
253 | run_end = ctx->time; |
254 | |
255 | event->total_time_running = run_end - event->tstamp_running; |
256 | } |
257 | |
258 | /* |
259 | * Update total_time_enabled and total_time_running for all events in a group. |
260 | */ |
261 | static void update_group_times(struct perf_event *leader) |
262 | { |
263 | struct perf_event *event; |
264 | |
265 | update_event_times(leader); |
266 | list_for_each_entry(event, &leader->sibling_list, group_entry) |
267 | update_event_times(event); |
268 | } |
269 | |
270 | static struct list_head * |
271 | ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) |
272 | { |
273 | if (event->attr.pinned) |
274 | return &ctx->pinned_groups; |
275 | else |
276 | return &ctx->flexible_groups; |
277 | } |
278 | |
279 | /* |
280 | * Add an event to the lists for its context. |
281 | * Must be called with ctx->mutex and ctx->lock held. |
282 | */ |
283 | static void |
284 | list_add_event(struct perf_event *event, struct perf_event_context *ctx) |
285 | { |
286 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); |
287 | event->attach_state |= PERF_ATTACH_CONTEXT; |
288 | |
289 | /* |
290 | * If we're a standalone event or group leader, we go on the context |
291 | * list; group events are kept attached to the group so that |
292 | * perf_group_detach can, at all times, locate all siblings. |
293 | */ |
294 | if (event->group_leader == event) { |
295 | struct list_head *list; |
296 | |
297 | if (is_software_event(event)) |
298 | event->group_flags |= PERF_GROUP_SOFTWARE; |
299 | |
300 | list = ctx_group_list(event, ctx); |
301 | list_add_tail(&event->group_entry, list); |
302 | } |
303 | |
304 | list_add_rcu(&event->event_entry, &ctx->event_list); |
305 | ctx->nr_events++; |
306 | if (event->attr.inherit_stat) |
307 | ctx->nr_stat++; |
308 | } |
309 | |
310 | static void perf_group_attach(struct perf_event *event) |
311 | { |
312 | struct perf_event *group_leader = event->group_leader; |
313 | |
314 | WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); |
315 | event->attach_state |= PERF_ATTACH_GROUP; |
316 | |
317 | if (group_leader == event) |
318 | return; |
319 | |
320 | if (group_leader->group_flags & PERF_GROUP_SOFTWARE && |
321 | !is_software_event(event)) |
322 | group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; |
323 | |
324 | list_add_tail(&event->group_entry, &group_leader->sibling_list); |
325 | group_leader->nr_siblings++; |
326 | } |
327 | |
328 | /* |
329 | * Remove an event from the lists for its context. |
330 | * Must be called with ctx->mutex and ctx->lock held. |
331 | */ |
332 | static void |
333 | list_del_event(struct perf_event *event, struct perf_event_context *ctx) |
334 | { |
335 | /* |
336 | * We can have double detach due to exit/hot-unplug + close. |
337 | */ |
338 | if (!(event->attach_state & PERF_ATTACH_CONTEXT)) |
339 | return; |
340 | |
341 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
342 | |
343 | ctx->nr_events--; |
344 | if (event->attr.inherit_stat) |
345 | ctx->nr_stat--; |
346 | |
347 | list_del_rcu(&event->event_entry); |
348 | |
349 | if (event->group_leader == event) |
350 | list_del_init(&event->group_entry); |
351 | |
352 | update_group_times(event); |
353 | |
354 | /* |
355 | * If event was in error state, then keep it |
356 | * that way, otherwise bogus counts will be |
357 | * returned on read(). The only way to get out |
358 | * of error state is by explicit re-enabling |
359 | * of the event |
360 | */ |
361 | if (event->state > PERF_EVENT_STATE_OFF) |
362 | event->state = PERF_EVENT_STATE_OFF; |
363 | } |
364 | |
365 | static void perf_group_detach(struct perf_event *event) |
366 | { |
367 | struct perf_event *sibling, *tmp; |
368 | struct list_head *list = NULL; |
369 | |
370 | /* |
371 | * We can have double detach due to exit/hot-unplug + close. |
372 | */ |
373 | if (!(event->attach_state & PERF_ATTACH_GROUP)) |
374 | return; |
375 | |
376 | event->attach_state &= ~PERF_ATTACH_GROUP; |
377 | |
378 | /* |
379 | * If this is a sibling, remove it from its group. |
380 | */ |
381 | if (event->group_leader != event) { |
382 | list_del_init(&event->group_entry); |
383 | event->group_leader->nr_siblings--; |
384 | return; |
385 | } |
386 | |
387 | if (!list_empty(&event->group_entry)) |
388 | list = &event->group_entry; |
389 | |
390 | /* |
391 | * If this was a group event with sibling events then |
392 | * upgrade the siblings to singleton events by adding them |
393 | * to whatever list we are on. |
394 | */ |
395 | list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { |
396 | if (list) |
397 | list_move_tail(&sibling->group_entry, list); |
398 | sibling->group_leader = sibling; |
399 | |
400 | /* Inherit group flags from the previous leader */ |
401 | sibling->group_flags = event->group_flags; |
402 | } |
403 | } |
404 | |
405 | static void |
406 | event_sched_out(struct perf_event *event, |
407 | struct perf_cpu_context *cpuctx, |
408 | struct perf_event_context *ctx) |
409 | { |
410 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
411 | return; |
412 | |
413 | event->state = PERF_EVENT_STATE_INACTIVE; |
414 | if (event->pending_disable) { |
415 | event->pending_disable = 0; |
416 | event->state = PERF_EVENT_STATE_OFF; |
417 | } |
418 | event->tstamp_stopped = ctx->time; |
419 | event->pmu->disable(event); |
420 | event->oncpu = -1; |
421 | |
422 | if (!is_software_event(event)) |
423 | cpuctx->active_oncpu--; |
424 | ctx->nr_active--; |
425 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
426 | cpuctx->exclusive = 0; |
427 | } |
428 | |
429 | static void |
430 | group_sched_out(struct perf_event *group_event, |
431 | struct perf_cpu_context *cpuctx, |
432 | struct perf_event_context *ctx) |
433 | { |
434 | struct perf_event *event; |
435 | |
436 | if (group_event->state != PERF_EVENT_STATE_ACTIVE) |
437 | return; |
438 | |
439 | event_sched_out(group_event, cpuctx, ctx); |
440 | |
441 | /* |
442 | * Schedule out siblings (if any): |
443 | */ |
444 | list_for_each_entry(event, &group_event->sibling_list, group_entry) |
445 | event_sched_out(event, cpuctx, ctx); |
446 | |
447 | if (group_event->attr.exclusive) |
448 | cpuctx->exclusive = 0; |
449 | } |
450 | |
451 | /* |
452 | * Cross CPU call to remove a performance event |
453 | * |
454 | * We disable the event on the hardware level first. After that we |
455 | * remove it from the context list. |
456 | */ |
457 | static void __perf_event_remove_from_context(void *info) |
458 | { |
459 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
460 | struct perf_event *event = info; |
461 | struct perf_event_context *ctx = event->ctx; |
462 | |
463 | /* |
464 | * If this is a task context, we need to check whether it is |
465 | * the current task context of this cpu. If not it has been |
466 | * scheduled out before the smp call arrived. |
467 | */ |
468 | if (ctx->task && cpuctx->task_ctx != ctx) |
469 | return; |
470 | |
471 | raw_spin_lock(&ctx->lock); |
472 | /* |
473 | * Protect the list operation against NMI by disabling the |
474 | * events on a global level. |
475 | */ |
476 | perf_disable(); |
477 | |
478 | event_sched_out(event, cpuctx, ctx); |
479 | |
480 | list_del_event(event, ctx); |
481 | |
482 | if (!ctx->task) { |
483 | /* |
484 | * Allow more per task events with respect to the |
485 | * reservation: |
486 | */ |
487 | cpuctx->max_pertask = |
488 | min(perf_max_events - ctx->nr_events, |
489 | perf_max_events - perf_reserved_percpu); |
490 | } |
491 | |
492 | perf_enable(); |
493 | raw_spin_unlock(&ctx->lock); |
494 | } |
495 | |
496 | |
497 | /* |
498 | * Remove the event from a task's (or a CPU's) list of events. |
499 | * |
500 | * Must be called with ctx->mutex held. |
501 | * |
502 | * CPU events are removed with a smp call. For task events we only |
503 | * call when the task is on a CPU. |
504 | * |
505 | * If event->ctx is a cloned context, callers must make sure that |
506 | * every task struct that event->ctx->task could possibly point to |
507 | * remains valid. This is OK when called from perf_release since |
508 | * that only calls us on the top-level context, which can't be a clone. |
509 | * When called from perf_event_exit_task, it's OK because the |
510 | * context has been detached from its task. |
511 | */ |
512 | static void perf_event_remove_from_context(struct perf_event *event) |
513 | { |
514 | struct perf_event_context *ctx = event->ctx; |
515 | struct task_struct *task = ctx->task; |
516 | |
517 | if (!task) { |
518 | /* |
519 | * Per cpu events are removed via an smp call and |
520 | * the removal is always successful. |
521 | */ |
522 | smp_call_function_single(event->cpu, |
523 | __perf_event_remove_from_context, |
524 | event, 1); |
525 | return; |
526 | } |
527 | |
528 | retry: |
529 | task_oncpu_function_call(task, __perf_event_remove_from_context, |
530 | event); |
531 | |
532 | raw_spin_lock_irq(&ctx->lock); |
533 | /* |
534 | * If the context is active we need to retry the smp call. |
535 | */ |
536 | if (ctx->nr_active && !list_empty(&event->group_entry)) { |
537 | raw_spin_unlock_irq(&ctx->lock); |
538 | goto retry; |
539 | } |
540 | |
541 | /* |
542 | * The lock prevents this context from being scheduled in, so |
543 | * we can remove the event safely if the call above did not |
544 | * succeed. |
545 | */ |
546 | if (!list_empty(&event->group_entry)) |
547 | list_del_event(event, ctx); |
548 | raw_spin_unlock_irq(&ctx->lock); |
549 | } |
550 | |
551 | /* |
552 | * Cross CPU call to disable a performance event |
553 | */ |
554 | static void __perf_event_disable(void *info) |
555 | { |
556 | struct perf_event *event = info; |
557 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
558 | struct perf_event_context *ctx = event->ctx; |
559 | |
560 | /* |
561 | * If this is a per-task event, need to check whether this |
562 | * event's task is the current task on this cpu. |
563 | */ |
564 | if (ctx->task && cpuctx->task_ctx != ctx) |
565 | return; |
566 | |
567 | raw_spin_lock(&ctx->lock); |
568 | |
569 | /* |
570 | * If the event is on, turn it off. |
571 | * If it is in error state, leave it in error state. |
572 | */ |
573 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
574 | update_context_time(ctx); |
575 | update_group_times(event); |
576 | if (event == event->group_leader) |
577 | group_sched_out(event, cpuctx, ctx); |
578 | else |
579 | event_sched_out(event, cpuctx, ctx); |
580 | event->state = PERF_EVENT_STATE_OFF; |
581 | } |
582 | |
583 | raw_spin_unlock(&ctx->lock); |
584 | } |
585 | |
586 | /* |
587 | * Disable an event. |
588 | * |
589 | * If event->ctx is a cloned context, callers must make sure that |
590 | * every task struct that event->ctx->task could possibly point to |
591 | * remains valid. This condition is satisfied when called through |
592 | * perf_event_for_each_child or perf_event_for_each because they |
593 | * hold the top-level event's child_mutex, so any descendant that |
594 | * goes to exit will block in sync_child_event. |
595 | * When called from perf_pending_event it's OK because event->ctx |
596 | * is the current context on this CPU and preemption is disabled, |
597 | * hence we can't get into perf_event_task_sched_out for this context. |
598 | */ |
599 | void perf_event_disable(struct perf_event *event) |
600 | { |
601 | struct perf_event_context *ctx = event->ctx; |
602 | struct task_struct *task = ctx->task; |
603 | |
604 | if (!task) { |
605 | /* |
606 | * Disable the event on the cpu that it's on |
607 | */ |
608 | smp_call_function_single(event->cpu, __perf_event_disable, |
609 | event, 1); |
610 | return; |
611 | } |
612 | |
613 | retry: |
614 | task_oncpu_function_call(task, __perf_event_disable, event); |
615 | |
616 | raw_spin_lock_irq(&ctx->lock); |
617 | /* |
618 | * If the event is still active, we need to retry the cross-call. |
619 | */ |
620 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
621 | raw_spin_unlock_irq(&ctx->lock); |
622 | goto retry; |
623 | } |
624 | |
625 | /* |
626 | * Since we have the lock this context can't be scheduled |
627 | * in, so we can change the state safely. |
628 | */ |
629 | if (event->state == PERF_EVENT_STATE_INACTIVE) { |
630 | update_group_times(event); |
631 | event->state = PERF_EVENT_STATE_OFF; |
632 | } |
633 | |
634 | raw_spin_unlock_irq(&ctx->lock); |
635 | } |
636 | |
637 | static int |
638 | event_sched_in(struct perf_event *event, |
639 | struct perf_cpu_context *cpuctx, |
640 | struct perf_event_context *ctx) |
641 | { |
642 | if (event->state <= PERF_EVENT_STATE_OFF) |
643 | return 0; |
644 | |
645 | event->state = PERF_EVENT_STATE_ACTIVE; |
646 | event->oncpu = smp_processor_id(); |
647 | /* |
648 | * The new state must be visible before we turn it on in the hardware: |
649 | */ |
650 | smp_wmb(); |
651 | |
652 | if (event->pmu->enable(event)) { |
653 | event->state = PERF_EVENT_STATE_INACTIVE; |
654 | event->oncpu = -1; |
655 | return -EAGAIN; |
656 | } |
657 | |
658 | event->tstamp_running += ctx->time - event->tstamp_stopped; |
659 | |
660 | if (!is_software_event(event)) |
661 | cpuctx->active_oncpu++; |
662 | ctx->nr_active++; |
663 | |
664 | if (event->attr.exclusive) |
665 | cpuctx->exclusive = 1; |
666 | |
667 | return 0; |
668 | } |
669 | |
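/*
 * Schedule an entire group onto the PMU. When the PMU provides transaction
 * hooks, the leader and all siblings are added under one transaction and only
 * committed if every event_sched_in() succeeded; otherwise the partially
 * scheduled group is torn down again.
 */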
670 | static int |
671 | group_sched_in(struct perf_event *group_event, |
672 | struct perf_cpu_context *cpuctx, |
673 | struct perf_event_context *ctx) |
674 | { |
675 | struct perf_event *event, *partial_group = NULL; |
676 | const struct pmu *pmu = group_event->pmu; |
677 | bool txn = false; |
678 | int ret; |
679 | |
680 | if (group_event->state == PERF_EVENT_STATE_OFF) |
681 | return 0; |
682 | |
683 | /* Check if group transactions are available */ |
684 | if (pmu->start_txn) |
685 | txn = true; |
686 | |
687 | if (txn) |
688 | pmu->start_txn(pmu); |
689 | |
690 | if (event_sched_in(group_event, cpuctx, ctx)) { |
691 | if (txn) |
692 | pmu->cancel_txn(pmu); |
693 | return -EAGAIN; |
694 | } |
695 | |
696 | /* |
697 | * Schedule in siblings as one group (if any): |
698 | */ |
699 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
700 | if (event_sched_in(event, cpuctx, ctx)) { |
701 | partial_group = event; |
702 | goto group_error; |
703 | } |
704 | } |
705 | |
706 | if (!txn) |
707 | return 0; |
708 | |
709 | ret = pmu->commit_txn(pmu); |
710 | if (!ret) { |
711 | pmu->cancel_txn(pmu); |
712 | return 0; |
713 | } |
714 | |
715 | group_error: |
716 | /* |
717 | * Groups can be scheduled in as one unit only, so undo any |
718 | * partial group before returning: |
719 | */ |
720 | list_for_each_entry(event, &group_event->sibling_list, group_entry) { |
721 | if (event == partial_group) |
722 | break; |
723 | event_sched_out(event, cpuctx, ctx); |
724 | } |
725 | event_sched_out(group_event, cpuctx, ctx); |
726 | |
727 | if (txn) |
728 | pmu->cancel_txn(pmu); |
729 | |
730 | return -EAGAIN; |
731 | } |
732 | |
733 | /* |
734 | * Work out whether we can put this event group on the CPU now. |
735 | */ |
736 | static int group_can_go_on(struct perf_event *event, |
737 | struct perf_cpu_context *cpuctx, |
738 | int can_add_hw) |
739 | { |
740 | /* |
741 | * Groups consisting entirely of software events can always go on. |
742 | */ |
743 | if (event->group_flags & PERF_GROUP_SOFTWARE) |
744 | return 1; |
745 | /* |
746 | * If an exclusive group is already on, no other hardware |
747 | * events can go on. |
748 | */ |
749 | if (cpuctx->exclusive) |
750 | return 0; |
751 | /* |
752 | * If this group is exclusive and there are already |
753 | * events on the CPU, it can't go on. |
754 | */ |
755 | if (event->attr.exclusive && cpuctx->active_oncpu) |
756 | return 0; |
757 | /* |
758 | * Otherwise, try to add it if all previous groups were able |
759 | * to go on. |
760 | */ |
761 | return can_add_hw; |
762 | } |
763 | |
764 | static void add_event_to_ctx(struct perf_event *event, |
765 | struct perf_event_context *ctx) |
766 | { |
767 | list_add_event(event, ctx); |
768 | perf_group_attach(event); |
769 | event->tstamp_enabled = ctx->time; |
770 | event->tstamp_running = ctx->time; |
771 | event->tstamp_stopped = ctx->time; |
772 | } |
773 | |
774 | /* |
775 | * Cross CPU call to install and enable a performance event |
776 | * |
777 | * Must be called with ctx->mutex held |
778 | */ |
779 | static void __perf_install_in_context(void *info) |
780 | { |
781 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
782 | struct perf_event *event = info; |
783 | struct perf_event_context *ctx = event->ctx; |
784 | struct perf_event *leader = event->group_leader; |
785 | int err; |
786 | |
787 | /* |
788 | * If this is a task context, we need to check whether it is |
789 | * the current task context of this cpu. If not it has been |
790 | * scheduled out before the smp call arrived. |
791 | * Or possibly this is the right context but it isn't |
792 | * on this cpu because it had no events. |
793 | */ |
794 | if (ctx->task && cpuctx->task_ctx != ctx) { |
795 | if (cpuctx->task_ctx || ctx->task != current) |
796 | return; |
797 | cpuctx->task_ctx = ctx; |
798 | } |
799 | |
800 | raw_spin_lock(&ctx->lock); |
801 | ctx->is_active = 1; |
802 | update_context_time(ctx); |
803 | |
804 | /* |
805 | * Protect the list operation against NMI by disabling the |
806 | * events on a global level. NOP for non NMI based events. |
807 | */ |
808 | perf_disable(); |
809 | |
810 | add_event_to_ctx(event, ctx); |
811 | |
812 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
813 | goto unlock; |
814 | |
815 | /* |
816 | * Don't put the event on if it is disabled or if |
817 | * it is in a group and the group isn't on. |
818 | */ |
819 | if (event->state != PERF_EVENT_STATE_INACTIVE || |
820 | (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) |
821 | goto unlock; |
822 | |
823 | /* |
824 | * An exclusive event can't go on if there are already active |
825 | * hardware events, and no hardware event can go on if there |
826 | * is already an exclusive event on. |
827 | */ |
828 | if (!group_can_go_on(event, cpuctx, 1)) |
829 | err = -EEXIST; |
830 | else |
831 | err = event_sched_in(event, cpuctx, ctx); |
832 | |
833 | if (err) { |
834 | /* |
835 | * This event couldn't go on. If it is in a group |
836 | * then we have to pull the whole group off. |
837 | * If the event group is pinned then put it in error state. |
838 | */ |
839 | if (leader != event) |
840 | group_sched_out(leader, cpuctx, ctx); |
841 | if (leader->attr.pinned) { |
842 | update_group_times(leader); |
843 | leader->state = PERF_EVENT_STATE_ERROR; |
844 | } |
845 | } |
846 | |
847 | if (!err && !ctx->task && cpuctx->max_pertask) |
848 | cpuctx->max_pertask--; |
849 | |
850 | unlock: |
851 | perf_enable(); |
852 | |
853 | raw_spin_unlock(&ctx->lock); |
854 | } |
855 | |
856 | /* |
857 | * Attach a performance event to a context |
858 | * |
859 | * First we add the event to the list with the hardware enable bit |
860 | * in event->hw_config cleared. |
861 | * |
862 | * If the event is attached to a task which is on a CPU we use an smp |
863 | * call to enable it in the task context. The task might have been |
864 | * scheduled away, but we check this in the smp call again. |
865 | * |
866 | * Must be called with ctx->mutex held. |
867 | */ |
868 | static void |
869 | perf_install_in_context(struct perf_event_context *ctx, |
870 | struct perf_event *event, |
871 | int cpu) |
872 | { |
873 | struct task_struct *task = ctx->task; |
874 | |
875 | if (!task) { |
876 | /* |
877 | * Per cpu events are installed via an smp call and |
878 | * the install is always successful. |
879 | */ |
880 | smp_call_function_single(cpu, __perf_install_in_context, |
881 | event, 1); |
882 | return; |
883 | } |
884 | |
885 | retry: |
886 | task_oncpu_function_call(task, __perf_install_in_context, |
887 | event); |
888 | |
889 | raw_spin_lock_irq(&ctx->lock); |
890 | /* |
891 | * If the context is active we need to retry the smp call. |
892 | */ |
893 | if (ctx->is_active && list_empty(&event->group_entry)) { |
894 | raw_spin_unlock_irq(&ctx->lock); |
895 | goto retry; |
896 | } |
897 | |
898 | /* |
899 | * The lock prevents this context from being scheduled in, so |
900 | * we can add the event safely if the call above did not |
901 | * succeed. |
902 | */ |
903 | if (list_empty(&event->group_entry)) |
904 | add_event_to_ctx(event, ctx); |
905 | raw_spin_unlock_irq(&ctx->lock); |
906 | } |
907 | |
908 | /* |
909 | * Put an event into inactive state and update time fields. |
910 | * Enabling the leader of a group effectively enables all |
911 | * the group members that aren't explicitly disabled, so we |
912 | * have to update their ->tstamp_enabled also. |
913 | * Note: this works for group members as well as group leaders |
914 | * since the non-leader members' sibling_lists will be empty. |
915 | */ |
916 | static void __perf_event_mark_enabled(struct perf_event *event, |
917 | struct perf_event_context *ctx) |
918 | { |
919 | struct perf_event *sub; |
920 | |
921 | event->state = PERF_EVENT_STATE_INACTIVE; |
922 | event->tstamp_enabled = ctx->time - event->total_time_enabled; |
923 | list_for_each_entry(sub, &event->sibling_list, group_entry) |
924 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) |
925 | sub->tstamp_enabled = |
926 | ctx->time - sub->total_time_enabled; |
927 | } |
928 | |
929 | /* |
930 | * Cross CPU call to enable a performance event |
931 | */ |
932 | static void __perf_event_enable(void *info) |
933 | { |
934 | struct perf_event *event = info; |
935 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
936 | struct perf_event_context *ctx = event->ctx; |
937 | struct perf_event *leader = event->group_leader; |
938 | int err; |
939 | |
940 | /* |
941 | * If this is a per-task event, need to check whether this |
942 | * event's task is the current task on this cpu. |
943 | */ |
944 | if (ctx->task && cpuctx->task_ctx != ctx) { |
945 | if (cpuctx->task_ctx || ctx->task != current) |
946 | return; |
947 | cpuctx->task_ctx = ctx; |
948 | } |
949 | |
950 | raw_spin_lock(&ctx->lock); |
951 | ctx->is_active = 1; |
952 | update_context_time(ctx); |
953 | |
954 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
955 | goto unlock; |
956 | __perf_event_mark_enabled(event, ctx); |
957 | |
958 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
959 | goto unlock; |
960 | |
961 | /* |
962 | * If the event is in a group and isn't the group leader, |
963 | * then don't put it on unless the group is on. |
964 | */ |
965 | if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) |
966 | goto unlock; |
967 | |
968 | if (!group_can_go_on(event, cpuctx, 1)) { |
969 | err = -EEXIST; |
970 | } else { |
971 | perf_disable(); |
972 | if (event == leader) |
973 | err = group_sched_in(event, cpuctx, ctx); |
974 | else |
975 | err = event_sched_in(event, cpuctx, ctx); |
976 | perf_enable(); |
977 | } |
978 | |
979 | if (err) { |
980 | /* |
981 | * If this event can't go on and it's part of a |
982 | * group, then the whole group has to come off. |
983 | */ |
984 | if (leader != event) |
985 | group_sched_out(leader, cpuctx, ctx); |
986 | if (leader->attr.pinned) { |
987 | update_group_times(leader); |
988 | leader->state = PERF_EVENT_STATE_ERROR; |
989 | } |
990 | } |
991 | |
992 | unlock: |
993 | raw_spin_unlock(&ctx->lock); |
994 | } |
995 | |
996 | /* |
997 | * Enable an event. |
998 | * |
999 | * If event->ctx is a cloned context, callers must make sure that |
1000 | * every task struct that event->ctx->task could possibly point to |
1001 | * remains valid. This condition is satisfied when called through |
1002 | * perf_event_for_each_child or perf_event_for_each as described |
1003 | * for perf_event_disable. |
1004 | */ |
1005 | void perf_event_enable(struct perf_event *event) |
1006 | { |
1007 | struct perf_event_context *ctx = event->ctx; |
1008 | struct task_struct *task = ctx->task; |
1009 | |
1010 | if (!task) { |
1011 | /* |
1012 | * Enable the event on the cpu that it's on |
1013 | */ |
1014 | smp_call_function_single(event->cpu, __perf_event_enable, |
1015 | event, 1); |
1016 | return; |
1017 | } |
1018 | |
1019 | raw_spin_lock_irq(&ctx->lock); |
1020 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1021 | goto out; |
1022 | |
1023 | /* |
1024 | * If the event is in error state, clear that first. |
1025 | * That way, if we see the event in error state below, we |
1026 | * know that it has gone back into error state, as distinct |
1027 | * from the task having been scheduled away before the |
1028 | * cross-call arrived. |
1029 | */ |
1030 | if (event->state == PERF_EVENT_STATE_ERROR) |
1031 | event->state = PERF_EVENT_STATE_OFF; |
1032 | |
1033 | retry: |
1034 | raw_spin_unlock_irq(&ctx->lock); |
1035 | task_oncpu_function_call(task, __perf_event_enable, event); |
1036 | |
1037 | raw_spin_lock_irq(&ctx->lock); |
1038 | |
1039 | /* |
1040 | * If the context is active and the event is still off, |
1041 | * we need to retry the cross-call. |
1042 | */ |
1043 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) |
1044 | goto retry; |
1045 | |
1046 | /* |
1047 | * Since we have the lock this context can't be scheduled |
1048 | * in, so we can change the state safely. |
1049 | */ |
1050 | if (event->state == PERF_EVENT_STATE_OFF) |
1051 | __perf_event_mark_enabled(event, ctx); |
1052 | |
1053 | out: |
1054 | raw_spin_unlock_irq(&ctx->lock); |
1055 | } |
1056 | |
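/*
 * Re-arm an event for a bounded number of overflows: add 'refresh' to the
 * event_limit counter and enable the event. The overflow path decrements the
 * limit and disables the event again once it is exhausted.
 */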
1057 | static int perf_event_refresh(struct perf_event *event, int refresh) |
1058 | { |
1059 | /* |
1060 | * not supported on inherited events |
1061 | */ |
1062 | if (event->attr.inherit) |
1063 | return -EINVAL; |
1064 | |
1065 | atomic_add(refresh, &event->event_limit); |
1066 | perf_event_enable(event); |
1067 | |
1068 | return 0; |
1069 | } |
1070 | |
1071 | enum event_type_t { |
1072 | EVENT_FLEXIBLE = 0x1, |
1073 | EVENT_PINNED = 0x2, |
1074 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, |
1075 | }; |
1076 | |
1077 | static void ctx_sched_out(struct perf_event_context *ctx, |
1078 | struct perf_cpu_context *cpuctx, |
1079 | enum event_type_t event_type) |
1080 | { |
1081 | struct perf_event *event; |
1082 | |
1083 | raw_spin_lock(&ctx->lock); |
1084 | ctx->is_active = 0; |
1085 | if (likely(!ctx->nr_events)) |
1086 | goto out; |
1087 | update_context_time(ctx); |
1088 | |
1089 | perf_disable(); |
1090 | if (!ctx->nr_active) |
1091 | goto out_enable; |
1092 | |
1093 | if (event_type & EVENT_PINNED) |
1094 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
1095 | group_sched_out(event, cpuctx, ctx); |
1096 | |
1097 | if (event_type & EVENT_FLEXIBLE) |
1098 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
1099 | group_sched_out(event, cpuctx, ctx); |
1100 | |
1101 | out_enable: |
1102 | perf_enable(); |
1103 | out: |
1104 | raw_spin_unlock(&ctx->lock); |
1105 | } |
1106 | |
1107 | /* |
1108 | * Test whether two contexts are equivalent, i.e. whether they |
1109 | * have both been cloned from the same version of the same context |
1110 | * and they both have the same number of enabled events. |
1111 | * If the number of enabled events is the same, then the set |
1112 | * of enabled events should be the same, because these are both |
1113 | * inherited contexts, therefore we can't access individual events |
1114 | * in them directly with an fd; we can only enable/disable all |
1115 | * events via prctl, or enable/disable all events in a family |
1116 | * via ioctl, which will have the same effect on both contexts. |
1117 | */ |
1118 | static int context_equiv(struct perf_event_context *ctx1, |
1119 | struct perf_event_context *ctx2) |
1120 | { |
1121 | return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx |
1122 | && ctx1->parent_gen == ctx2->parent_gen |
1123 | && !ctx1->pin_count && !ctx2->pin_count; |
1124 | } |
1125 | |
1126 | static void __perf_event_sync_stat(struct perf_event *event, |
1127 | struct perf_event *next_event) |
1128 | { |
1129 | u64 value; |
1130 | |
1131 | if (!event->attr.inherit_stat) |
1132 | return; |
1133 | |
1134 | /* |
1135 | * Update the event value, we cannot use perf_event_read() |
1136 | * because we're in the middle of a context switch and have IRQs |
1137 | * disabled, which upsets smp_call_function_single(), however |
1138 | * we know the event must be on the current CPU, therefore we |
1139 | * don't need to use it. |
1140 | */ |
1141 | switch (event->state) { |
1142 | case PERF_EVENT_STATE_ACTIVE: |
1143 | event->pmu->read(event); |
1144 | /* fall-through */ |
1145 | |
1146 | case PERF_EVENT_STATE_INACTIVE: |
1147 | update_event_times(event); |
1148 | break; |
1149 | |
1150 | default: |
1151 | break; |
1152 | } |
1153 | |
1154 | /* |
1155 | * In order to keep per-task stats reliable we need to flip the event |
1156 | * values when we flip the contexts. |
1157 | */ |
1158 | value = atomic64_read(&next_event->count); |
1159 | value = atomic64_xchg(&event->count, value); |
1160 | atomic64_set(&next_event->count, value); |
1161 | |
1162 | swap(event->total_time_enabled, next_event->total_time_enabled); |
1163 | swap(event->total_time_running, next_event->total_time_running); |
1164 | |
1165 | /* |
1166 | * Since we swizzled the values, update the user visible data too. |
1167 | */ |
1168 | perf_event_update_userpage(event); |
1169 | perf_event_update_userpage(next_event); |
1170 | } |
1171 | |
1172 | #define list_next_entry(pos, member) \ |
1173 | list_entry(pos->member.next, typeof(*pos), member) |
1174 | |
1175 | static void perf_event_sync_stat(struct perf_event_context *ctx, |
1176 | struct perf_event_context *next_ctx) |
1177 | { |
1178 | struct perf_event *event, *next_event; |
1179 | |
1180 | if (!ctx->nr_stat) |
1181 | return; |
1182 | |
1183 | update_context_time(ctx); |
1184 | |
1185 | event = list_first_entry(&ctx->event_list, |
1186 | struct perf_event, event_entry); |
1187 | |
1188 | next_event = list_first_entry(&next_ctx->event_list, |
1189 | struct perf_event, event_entry); |
1190 | |
1191 | while (&event->event_entry != &ctx->event_list && |
1192 | &next_event->event_entry != &next_ctx->event_list) { |
1193 | |
1194 | __perf_event_sync_stat(event, next_event); |
1195 | |
1196 | event = list_next_entry(event, event_entry); |
1197 | next_event = list_next_entry(next_event, event_entry); |
1198 | } |
1199 | } |
1200 | |
1201 | /* |
1202 | * Called from scheduler to remove the events of the current task, |
1203 | * with interrupts disabled. |
1204 | * |
1205 | * We stop each event and update the event value in event->count. |
1206 | * |
1207 | * This does not protect us against NMI, but disable() |
1208 | * sets the disabled bit in the control field of event _before_ |
1209 | * accessing the event control register. If an NMI hits, then it will |
1210 | * not restart the event. |
1211 | */ |
1212 | void perf_event_task_sched_out(struct task_struct *task, |
1213 | struct task_struct *next) |
1214 | { |
1215 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
1216 | struct perf_event_context *ctx = task->perf_event_ctxp; |
1217 | struct perf_event_context *next_ctx; |
1218 | struct perf_event_context *parent; |
1219 | int do_switch = 1; |
1220 | |
1221 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); |
1222 | |
1223 | if (likely(!ctx || !cpuctx->task_ctx)) |
1224 | return; |
1225 | |
1226 | rcu_read_lock(); |
1227 | parent = rcu_dereference(ctx->parent_ctx); |
1228 | next_ctx = next->perf_event_ctxp; |
1229 | if (parent && next_ctx && |
1230 | rcu_dereference(next_ctx->parent_ctx) == parent) { |
1231 | /* |
1232 | * Looks like the two contexts are clones, so we might be |
1233 | * able to optimize the context switch. We lock both |
1234 | * contexts and check that they are clones under the |
1235 | * lock (including re-checking that neither has been |
1236 | * uncloned in the meantime). It doesn't matter which |
1237 | * order we take the locks because no other cpu could |
1238 | * be trying to lock both of these tasks. |
1239 | */ |
1240 | raw_spin_lock(&ctx->lock); |
1241 | raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); |
1242 | if (context_equiv(ctx, next_ctx)) { |
1243 | /* |
1244 | * XXX do we need a memory barrier of sorts |
1245 | * wrt rcu_dereference() of perf_event_ctxp |
1246 | */ |
1247 | task->perf_event_ctxp = next_ctx; |
1248 | next->perf_event_ctxp = ctx; |
1249 | ctx->task = next; |
1250 | next_ctx->task = task; |
1251 | do_switch = 0; |
1252 | |
1253 | perf_event_sync_stat(ctx, next_ctx); |
1254 | } |
1255 | raw_spin_unlock(&next_ctx->lock); |
1256 | raw_spin_unlock(&ctx->lock); |
1257 | } |
1258 | rcu_read_unlock(); |
1259 | |
1260 | if (do_switch) { |
1261 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
1262 | cpuctx->task_ctx = NULL; |
1263 | } |
1264 | } |
1265 | |
1266 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
1267 | enum event_type_t event_type) |
1268 | { |
1269 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
1270 | |
1271 | if (!cpuctx->task_ctx) |
1272 | return; |
1273 | |
1274 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) |
1275 | return; |
1276 | |
1277 | ctx_sched_out(ctx, cpuctx, event_type); |
1278 | cpuctx->task_ctx = NULL; |
1279 | } |
1280 | |
1281 | /* |
1282 | * Called with IRQs disabled |
1283 | */ |
1284 | static void __perf_event_task_sched_out(struct perf_event_context *ctx) |
1285 | { |
1286 | task_ctx_sched_out(ctx, EVENT_ALL); |
1287 | } |
1288 | |
1289 | /* |
1290 | * Called with IRQs disabled |
1291 | */ |
1292 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
1293 | enum event_type_t event_type) |
1294 | { |
1295 | ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); |
1296 | } |
1297 | |
1298 | static void |
1299 | ctx_pinned_sched_in(struct perf_event_context *ctx, |
1300 | struct perf_cpu_context *cpuctx) |
1301 | { |
1302 | struct perf_event *event; |
1303 | |
1304 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1305 | if (event->state <= PERF_EVENT_STATE_OFF) |
1306 | continue; |
1307 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1308 | continue; |
1309 | |
1310 | if (group_can_go_on(event, cpuctx, 1)) |
1311 | group_sched_in(event, cpuctx, ctx); |
1312 | |
1313 | /* |
1314 | * If this pinned group hasn't been scheduled, |
1315 | * put it in error state. |
1316 | */ |
1317 | if (event->state == PERF_EVENT_STATE_INACTIVE) { |
1318 | update_group_times(event); |
1319 | event->state = PERF_EVENT_STATE_ERROR; |
1320 | } |
1321 | } |
1322 | } |
1323 | |
1324 | static void |
1325 | ctx_flexible_sched_in(struct perf_event_context *ctx, |
1326 | struct perf_cpu_context *cpuctx) |
1327 | { |
1328 | struct perf_event *event; |
1329 | int can_add_hw = 1; |
1330 | |
1331 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { |
1332 | /* Ignore events in OFF or ERROR state */ |
1333 | if (event->state <= PERF_EVENT_STATE_OFF) |
1334 | continue; |
1335 | /* |
1336 | * Listen to the 'cpu' scheduling filter constraint |
1337 | * of events: |
1338 | */ |
1339 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1340 | continue; |
1341 | |
1342 | if (group_can_go_on(event, cpuctx, can_add_hw)) |
1343 | if (group_sched_in(event, cpuctx, ctx)) |
1344 | can_add_hw = 0; |
1345 | } |
1346 | } |
1347 | |
1348 | static void |
1349 | ctx_sched_in(struct perf_event_context *ctx, |
1350 | struct perf_cpu_context *cpuctx, |
1351 | enum event_type_t event_type) |
1352 | { |
1353 | raw_spin_lock(&ctx->lock); |
1354 | ctx->is_active = 1; |
1355 | if (likely(!ctx->nr_events)) |
1356 | goto out; |
1357 | |
1358 | ctx->timestamp = perf_clock(); |
1359 | |
1360 | perf_disable(); |
1361 | |
1362 | /* |
1363 | * First go through the list and put on any pinned groups |
1364 | * in order to give them the best chance of going on. |
1365 | */ |
1366 | if (event_type & EVENT_PINNED) |
1367 | ctx_pinned_sched_in(ctx, cpuctx); |
1368 | |
1369 | /* Then walk through the lower prio flexible groups */ |
1370 | if (event_type & EVENT_FLEXIBLE) |
1371 | ctx_flexible_sched_in(ctx, cpuctx); |
1372 | |
1373 | perf_enable(); |
1374 | out: |
1375 | raw_spin_unlock(&ctx->lock); |
1376 | } |
1377 | |
1378 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1379 | enum event_type_t event_type) |
1380 | { |
1381 | struct perf_event_context *ctx = &cpuctx->ctx; |
1382 | |
1383 | ctx_sched_in(ctx, cpuctx, event_type); |
1384 | } |
1385 | |
1386 | static void task_ctx_sched_in(struct task_struct *task, |
1387 | enum event_type_t event_type) |
1388 | { |
1389 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
1390 | struct perf_event_context *ctx = task->perf_event_ctxp; |
1391 | |
1392 | if (likely(!ctx)) |
1393 | return; |
1394 | if (cpuctx->task_ctx == ctx) |
1395 | return; |
1396 | ctx_sched_in(ctx, cpuctx, event_type); |
1397 | cpuctx->task_ctx = ctx; |
1398 | } |
1399 | /* |
1400 | * Called from scheduler to add the events of the current task |
1401 | * with interrupts disabled. |
1402 | * |
1403 | * We restore the event value and then enable it. |
1404 | * |
1405 | * This does not protect us against NMI, but enable() |
1406 | * sets the enabled bit in the control field of event _before_ |
1407 | * accessing the event control register. If an NMI hits, then it will |
1408 | * keep the event running. |
1409 | */ |
1410 | void perf_event_task_sched_in(struct task_struct *task) |
1411 | { |
1412 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
1413 | struct perf_event_context *ctx = task->perf_event_ctxp; |
1414 | |
1415 | if (likely(!ctx)) |
1416 | return; |
1417 | |
1418 | if (cpuctx->task_ctx == ctx) |
1419 | return; |
1420 | |
1421 | perf_disable(); |
1422 | |
1423 | /* |
1424 | * We want to keep the following priority order: |
1425 | * cpu pinned (that don't need to move), task pinned, |
1426 | * cpu flexible, task flexible. |
1427 | */ |
1428 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1429 | |
1430 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); |
1431 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1432 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); |
1433 | |
1434 | cpuctx->task_ctx = ctx; |
1435 | |
1436 | perf_enable(); |
1437 | } |
1438 | |
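/*
 * hwc->interrupts is set to MAX_INTERRUPTS when an event gets throttled;
 * the next tick spots that marker and unthrottles it (perf_ctx_adjust_freq).
 */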
1439 | #define MAX_INTERRUPTS (~0ULL) |
1440 | |
1441 | static void perf_log_throttle(struct perf_event *event, int enable); |
1442 | |
1443 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
1444 | { |
1445 | u64 frequency = event->attr.sample_freq; |
1446 | u64 sec = NSEC_PER_SEC; |
1447 | u64 divisor, dividend; |
1448 | |
1449 | int count_fls, nsec_fls, frequency_fls, sec_fls; |
1450 | |
1451 | count_fls = fls64(count); |
1452 | nsec_fls = fls64(nsec); |
1453 | frequency_fls = fls64(frequency); |
1454 | sec_fls = 30; |
1455 | |
1456 | /* |
1457 | * We got @count in @nsec, with a target of sample_freq HZ |
1458 | * the target period becomes: |
1459 | * |
1460 | * @count * 10^9 |
1461 | * period = ------------------- |
1462 | * @nsec * sample_freq |
1463 | * |
1464 | */ |
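/*
 * Worked example (illustrative numbers): @count = 2,000,000 events over
 * @nsec = 10^9 ns with sample_freq = 1000 Hz yields
 * period = 2,000,000 * 10^9 / (10^9 * 1000) = 2000 events per sample.
 */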
1465 | |
1466 | /* |
1467 | * Reduce accuracy by one bit such that @a and @b converge |
1468 | * to a similar magnitude. |
1469 | */ |
1470 | #define REDUCE_FLS(a, b) \ |
1471 | do { \ |
1472 | if (a##_fls > b##_fls) { \ |
1473 | a >>= 1; \ |
1474 | a##_fls--; \ |
1475 | } else { \ |
1476 | b >>= 1; \ |
1477 | b##_fls--; \ |
1478 | } \ |
1479 | } while (0) |
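/*
 * E.g. REDUCE_FLS(nsec, frequency) halves whichever of the two currently
 * spans more bits, so repeated calls shrink the operands until the
 * multiplications below fit in 64 bits.
 */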
1480 | |
1481 | /* |
1482 | * Reduce accuracy until either term fits in a u64, then proceed with |
1483 | * the other, so that finally we can do a u64/u64 division. |
1484 | */ |
1485 | while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { |
1486 | REDUCE_FLS(nsec, frequency); |
1487 | REDUCE_FLS(sec, count); |
1488 | } |
1489 | |
1490 | if (count_fls + sec_fls > 64) { |
1491 | divisor = nsec * frequency; |
1492 | |
1493 | while (count_fls + sec_fls > 64) { |
1494 | REDUCE_FLS(count, sec); |
1495 | divisor >>= 1; |
1496 | } |
1497 | |
1498 | dividend = count * sec; |
1499 | } else { |
1500 | dividend = count * sec; |
1501 | |
1502 | while (nsec_fls + frequency_fls > 64) { |
1503 | REDUCE_FLS(nsec, frequency); |
1504 | dividend >>= 1; |
1505 | } |
1506 | |
1507 | divisor = nsec * frequency; |
1508 | } |
1509 | |
1510 | if (!divisor) |
1511 | return dividend; |
1512 | |
1513 | return div64_u64(dividend, divisor); |
1514 | } |
1515 | |
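/*
 * Helpers for perf_adjust_period(): prefer the PMU's ->stop()/->start()
 * hooks when available and fall back to ->disable()/->enable() for PMUs
 * that do not implement them.
 */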
1516 | static void perf_event_stop(struct perf_event *event) |
1517 | { |
1518 | if (!event->pmu->stop) |
1519 | return event->pmu->disable(event); |
1520 | |
1521 | return event->pmu->stop(event); |
1522 | } |
1523 | |
1524 | static int perf_event_start(struct perf_event *event) |
1525 | { |
1526 | if (!event->pmu->start) |
1527 | return event->pmu->enable(event); |
1528 | |
1529 | return event->pmu->start(event); |
1530 | } |
1531 | |
1532 | static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) |
1533 | { |
1534 | struct hw_perf_event *hwc = &event->hw; |
1535 | s64 period, sample_period; |
1536 | s64 delta; |
1537 | |
1538 | period = perf_calculate_period(event, nsec, count); |
1539 | |
1540 | delta = (s64)(period - hwc->sample_period); |
1541 | delta = (delta + 7) / 8; /* low pass filter */ |
1542 | |
1543 | sample_period = hwc->sample_period + delta; |
1544 | |
1545 | if (!sample_period) |
1546 | sample_period = 1; |
1547 | |
1548 | hwc->sample_period = sample_period; |
1549 | |
1550 | if (atomic64_read(&hwc->period_left) > 8*sample_period) { |
1551 | perf_disable(); |
1552 | perf_event_stop(event); |
1553 | atomic64_set(&hwc->period_left, 0); |
1554 | perf_event_start(event); |
1555 | perf_enable(); |
1556 | } |
1557 | } |
1558 | |
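/*
 * Called from perf_event_task_tick(): unthrottle events that were throttled
 * by the overflow handler and, for freq-based sampling events, recompute the
 * sample period from the number of events counted since the last tick.
 */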
1559 | static void perf_ctx_adjust_freq(struct perf_event_context *ctx) |
1560 | { |
1561 | struct perf_event *event; |
1562 | struct hw_perf_event *hwc; |
1563 | u64 interrupts, now; |
1564 | s64 delta; |
1565 | |
1566 | raw_spin_lock(&ctx->lock); |
1567 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
1568 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
1569 | continue; |
1570 | |
1571 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
1572 | continue; |
1573 | |
1574 | hwc = &event->hw; |
1575 | |
1576 | interrupts = hwc->interrupts; |
1577 | hwc->interrupts = 0; |
1578 | |
1579 | /* |
1580 | * unthrottle events on the tick |
1581 | */ |
1582 | if (interrupts == MAX_INTERRUPTS) { |
1583 | perf_log_throttle(event, 1); |
1584 | perf_disable(); |
1585 | event->pmu->unthrottle(event); |
1586 | perf_enable(); |
1587 | } |
1588 | |
1589 | if (!event->attr.freq || !event->attr.sample_freq) |
1590 | continue; |
1591 | |
1592 | perf_disable(); |
1593 | event->pmu->read(event); |
1594 | now = atomic64_read(&event->count); |
1595 | delta = now - hwc->freq_count_stamp; |
1596 | hwc->freq_count_stamp = now; |
1597 | |
1598 | if (delta > 0) |
1599 | perf_adjust_period(event, TICK_NSEC, delta); |
1600 | perf_enable(); |
1601 | } |
1602 | raw_spin_unlock(&ctx->lock); |
1603 | } |
1604 | |
1605 | /* |
1606 | * Round-robin a context's events: |
1607 | */ |
1608 | static void rotate_ctx(struct perf_event_context *ctx) |
1609 | { |
1610 | raw_spin_lock(&ctx->lock); |
1611 | |
1612 | /* Rotate the first entry of the non-pinned groups to the end */ |
1613 | list_rotate_left(&ctx->flexible_groups); |
1614 | |
1615 | raw_spin_unlock(&ctx->lock); |
1616 | } |
1617 | |
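/*
 * Scheduler tick hook: adjust sampling frequencies and round-robin the
 * flexible groups so that, over time, every group gets PMU time when there
 * are more events than hardware counters.
 */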
1618 | void perf_event_task_tick(struct task_struct *curr) |
1619 | { |
1620 | struct perf_cpu_context *cpuctx; |
1621 | struct perf_event_context *ctx; |
1622 | int rotate = 0; |
1623 | |
1624 | if (!atomic_read(&nr_events)) |
1625 | return; |
1626 | |
1627 | cpuctx = &__get_cpu_var(perf_cpu_context); |
1628 | if (cpuctx->ctx.nr_events && |
1629 | cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
1630 | rotate = 1; |
1631 | |
1632 | ctx = curr->perf_event_ctxp; |
1633 | if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) |
1634 | rotate = 1; |
1635 | |
1636 | perf_ctx_adjust_freq(&cpuctx->ctx); |
1637 | if (ctx) |
1638 | perf_ctx_adjust_freq(ctx); |
1639 | |
1640 | if (!rotate) |
1641 | return; |
1642 | |
1643 | perf_disable(); |
1644 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1645 | if (ctx) |
1646 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); |
1647 | |
1648 | rotate_ctx(&cpuctx->ctx); |
1649 | if (ctx) |
1650 | rotate_ctx(ctx); |
1651 | |
1652 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); |
1653 | if (ctx) |
1654 | task_ctx_sched_in(curr, EVENT_FLEXIBLE); |
1655 | perf_enable(); |
1656 | } |
1657 | |
1658 | static int event_enable_on_exec(struct perf_event *event, |
1659 | struct perf_event_context *ctx) |
1660 | { |
1661 | if (!event->attr.enable_on_exec) |
1662 | return 0; |
1663 | |
1664 | event->attr.enable_on_exec = 0; |
1665 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1666 | return 0; |
1667 | |
1668 | __perf_event_mark_enabled(event, ctx); |
1669 | |
1670 | return 1; |
1671 | } |
1672 | |
1673 | /* |
1674 | * Enable all of a task's events that have been marked enable-on-exec. |
1675 | * This expects task == current. |
1676 | */ |
1677 | static void perf_event_enable_on_exec(struct task_struct *task) |
1678 | { |
1679 | struct perf_event_context *ctx; |
1680 | struct perf_event *event; |
1681 | unsigned long flags; |
1682 | int enabled = 0; |
1683 | int ret; |
1684 | |
1685 | local_irq_save(flags); |
1686 | ctx = task->perf_event_ctxp; |
1687 | if (!ctx || !ctx->nr_events) |
1688 | goto out; |
1689 | |
1690 | __perf_event_task_sched_out(ctx); |
1691 | |
1692 | raw_spin_lock(&ctx->lock); |
1693 | |
1694 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
1695 | ret = event_enable_on_exec(event, ctx); |
1696 | if (ret) |
1697 | enabled = 1; |
1698 | } |
1699 | |
1700 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { |
1701 | ret = event_enable_on_exec(event, ctx); |
1702 | if (ret) |
1703 | enabled = 1; |
1704 | } |
1705 | |
1706 | /* |
1707 | * Unclone this context if we enabled any event. |
1708 | */ |
1709 | if (enabled) |
1710 | unclone_ctx(ctx); |
1711 | |
1712 | raw_spin_unlock(&ctx->lock); |
1713 | |
1714 | perf_event_task_sched_in(task); |
1715 | out: |
1716 | local_irq_restore(flags); |
1717 | } |
1718 | |
1719 | /* |
1720 | * Cross CPU call to read the hardware event |
1721 | */ |
1722 | static void __perf_event_read(void *info) |
1723 | { |
1724 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
1725 | struct perf_event *event = info; |
1726 | struct perf_event_context *ctx = event->ctx; |
1727 | |
1728 | /* |
1729 | * If this is a task context, we need to check whether it is |
1730 | * the current task context of this cpu. If not it has been |
1731 | * scheduled out before the smp call arrived. In that case |
1732 | * event->count would have been updated to a recent sample |
1733 | * when the event was scheduled out. |
1734 | */ |
1735 | if (ctx->task && cpuctx->task_ctx != ctx) |
1736 | return; |
1737 | |
1738 | raw_spin_lock(&ctx->lock); |
1739 | update_context_time(ctx); |
1740 | update_event_times(event); |
1741 | raw_spin_unlock(&ctx->lock); |
1742 | |
1743 | event->pmu->read(event); |
1744 | } |
1745 | |
1746 | static u64 perf_event_read(struct perf_event *event) |
1747 | { |
1748 | /* |
1749 | * If event is enabled and currently active on a CPU, update the |
1750 | * value in the event structure: |
1751 | */ |
1752 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
1753 | smp_call_function_single(event->oncpu, |
1754 | __perf_event_read, event, 1); |
1755 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { |
1756 | struct perf_event_context *ctx = event->ctx; |
1757 | unsigned long flags; |
1758 | |
1759 | raw_spin_lock_irqsave(&ctx->lock, flags); |
1760 | update_context_time(ctx); |
1761 | update_event_times(event); |
1762 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1763 | } |
1764 | |
1765 | return atomic64_read(&event->count); |
1766 | } |
1767 | |
1768 | /* |
1769 | * Initialize the perf_event context in a task_struct: |
1770 | */ |
1771 | static void |
1772 | __perf_event_init_context(struct perf_event_context *ctx, |
1773 | struct task_struct *task) |
1774 | { |
1775 | raw_spin_lock_init(&ctx->lock); |
1776 | mutex_init(&ctx->mutex); |
1777 | INIT_LIST_HEAD(&ctx->pinned_groups); |
1778 | INIT_LIST_HEAD(&ctx->flexible_groups); |
1779 | INIT_LIST_HEAD(&ctx->event_list); |
1780 | atomic_set(&ctx->refcount, 1); |
1781 | ctx->task = task; |
1782 | } |
1783 | |
1784 | static struct perf_event_context *find_get_context(pid_t pid, int cpu) |
1785 | { |
1786 | struct perf_event_context *ctx; |
1787 | struct perf_cpu_context *cpuctx; |
1788 | struct task_struct *task; |
1789 | unsigned long flags; |
1790 | int err; |
1791 | |
1792 | if (pid == -1 && cpu != -1) { |
1793 | /* Must be root to operate on a CPU event: */ |
1794 | if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) |
1795 | return ERR_PTR(-EACCES); |
1796 | |
1797 | if (cpu < 0 || cpu >= nr_cpumask_bits) |
1798 | return ERR_PTR(-EINVAL); |
1799 | |
1800 | /* |
1801 | * We could be clever and allow attaching an event to an |
1802 | * offline CPU and activate it when the CPU comes up, but |
1803 | * that's for later. |
1804 | */ |
1805 | if (!cpu_online(cpu)) |
1806 | return ERR_PTR(-ENODEV); |
1807 | |
1808 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
1809 | ctx = &cpuctx->ctx; |
1810 | get_ctx(ctx); |
1811 | |
1812 | return ctx; |
1813 | } |
1814 | |
1815 | rcu_read_lock(); |
1816 | if (!pid) |
1817 | task = current; |
1818 | else |
1819 | task = find_task_by_vpid(pid); |
1820 | if (task) |
1821 | get_task_struct(task); |
1822 | rcu_read_unlock(); |
1823 | |
1824 | if (!task) |
1825 | return ERR_PTR(-ESRCH); |
1826 | |
1827 | /* |
1828 | * Can't attach events to a dying task. |
1829 | */ |
1830 | err = -ESRCH; |
1831 | if (task->flags & PF_EXITING) |
1832 | goto errout; |
1833 | |
1834 | /* Reuse ptrace permission checks for now. */ |
1835 | err = -EACCES; |
1836 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) |
1837 | goto errout; |
1838 | |
1839 | retry: |
1840 | ctx = perf_lock_task_context(task, &flags); |
1841 | if (ctx) { |
1842 | unclone_ctx(ctx); |
1843 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1844 | } |
1845 | |
1846 | if (!ctx) { |
1847 | ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); |
1848 | err = -ENOMEM; |
1849 | if (!ctx) |
1850 | goto errout; |
1851 | __perf_event_init_context(ctx, task); |
1852 | get_ctx(ctx); |
1853 | if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { |
1854 | /* |
1855 | * We raced with some other task; use |
1856 | * the context they set. |
1857 | */ |
1858 | kfree(ctx); |
1859 | goto retry; |
1860 | } |
1861 | get_task_struct(task); |
1862 | } |
1863 | |
1864 | put_task_struct(task); |
1865 | return ctx; |
1866 | |
1867 | errout: |
1868 | put_task_struct(task); |
1869 | return ERR_PTR(err); |
1870 | } |
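/*
 * Summary of the (pid, cpu) convention handled above: pid == -1 with a
 * valid cpu selects that CPU's context (which may require CAP_SYS_ADMIN,
 * depending on perf_event_paranoid); pid == 0 selects the current task and
 * pid > 0 any other task we are allowed to ptrace.  A task context is
 * allocated lazily and attached with cmpxchg(), so two callers racing on
 * the same task end up agreeing on a single context.
 */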
1871 | |
1872 | static void perf_event_free_filter(struct perf_event *event); |
1873 | |
1874 | static void free_event_rcu(struct rcu_head *head) |
1875 | { |
1876 | struct perf_event *event; |
1877 | |
1878 | event = container_of(head, struct perf_event, rcu_head); |
1879 | if (event->ns) |
1880 | put_pid_ns(event->ns); |
1881 | perf_event_free_filter(event); |
1882 | kfree(event); |
1883 | } |
1884 | |
1885 | static void perf_pending_sync(struct perf_event *event); |
1886 | static void perf_mmap_data_put(struct perf_mmap_data *data); |
1887 | |
1888 | static void free_event(struct perf_event *event) |
1889 | { |
1890 | perf_pending_sync(event); |
1891 | |
1892 | if (!event->parent) { |
1893 | atomic_dec(&nr_events); |
1894 | if (event->attr.mmap) |
1895 | atomic_dec(&nr_mmap_events); |
1896 | if (event->attr.comm) |
1897 | atomic_dec(&nr_comm_events); |
1898 | if (event->attr.task) |
1899 | atomic_dec(&nr_task_events); |
1900 | } |
1901 | |
1902 | if (event->data) { |
1903 | perf_mmap_data_put(event->data); |
1904 | event->data = NULL; |
1905 | } |
1906 | |
1907 | if (event->destroy) |
1908 | event->destroy(event); |
1909 | |
1910 | put_ctx(event->ctx); |
1911 | call_rcu(&event->rcu_head, free_event_rcu); |
1912 | } |
1913 | |
1914 | int perf_event_release_kernel(struct perf_event *event) |
1915 | { |
1916 | struct perf_event_context *ctx = event->ctx; |
1917 | |
1918 | /* |
1919 | * Remove from the PMU, can't get re-enabled since we got |
1920 | * here because the last ref went. |
1921 | */ |
1922 | perf_event_disable(event); |
1923 | |
1924 | WARN_ON_ONCE(ctx->parent_ctx); |
1925 | /* |
1926 | * There are two ways this annotation is useful: |
1927 | * |
1928 | * 1) there is a lock recursion from perf_event_exit_task |
1929 | * see the comment there. |
1930 | * |
1931 | * 2) there is a lock-inversion with mmap_sem through |
1932 | * perf_event_read_group(), which takes faults while |
1933 | * holding ctx->mutex, however this is called after |
1934 | * the last filedesc died, so there is no possibility |
1935 | * to trigger the AB-BA case. |
1936 | */ |
1937 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); |
1938 | raw_spin_lock_irq(&ctx->lock); |
1939 | perf_group_detach(event); |
1940 | list_del_event(event, ctx); |
1941 | raw_spin_unlock_irq(&ctx->lock); |
1942 | mutex_unlock(&ctx->mutex); |
1943 | |
1944 | mutex_lock(&event->owner->perf_event_mutex); |
1945 | list_del_init(&event->owner_entry); |
1946 | mutex_unlock(&event->owner->perf_event_mutex); |
1947 | put_task_struct(event->owner); |
1948 | |
1949 | free_event(event); |
1950 | |
1951 | return 0; |
1952 | } |
1953 | EXPORT_SYMBOL_GPL(perf_event_release_kernel); |
1954 | |
1955 | /* |
1956 | * Called when the last reference to the file is gone. |
1957 | */ |
1958 | static int perf_release(struct inode *inode, struct file *file) |
1959 | { |
1960 | struct perf_event *event = file->private_data; |
1961 | |
1962 | file->private_data = NULL; |
1963 | |
1964 | return perf_event_release_kernel(event); |
1965 | } |
1966 | |
1967 | static int perf_event_read_size(struct perf_event *event) |
1968 | { |
1969 | int entry = sizeof(u64); /* value */ |
1970 | int size = 0; |
1971 | int nr = 1; |
1972 | |
1973 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
1974 | size += sizeof(u64); |
1975 | |
1976 | if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
1977 | size += sizeof(u64); |
1978 | |
1979 | if (event->attr.read_format & PERF_FORMAT_ID) |
1980 | entry += sizeof(u64); |
1981 | |
1982 | if (event->attr.read_format & PERF_FORMAT_GROUP) { |
1983 | nr += event->group_leader->nr_siblings; |
1984 | size += sizeof(u64); |
1985 | } |
1986 | |
1987 | size += entry * nr; |
1988 | |
1989 | return size; |
1990 | } |
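/*
 * Worked example for the accounting above: with read_format =
 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID on a non-group event,
 * entry = 8 (value) + 8 (id), size = 8 (time_enabled) and nr = 1, so a
 * read() buffer must hold at least 24 bytes.  Adding PERF_FORMAT_GROUP
 * with two siblings makes nr = 3 and adds one more u64 for the member
 * count, i.e. 8 + 8 + 3 * 16 = 64 bytes.
 */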
1991 | |
1992 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
1993 | { |
1994 | struct perf_event *child; |
1995 | u64 total = 0; |
1996 | |
1997 | *enabled = 0; |
1998 | *running = 0; |
1999 | |
2000 | mutex_lock(&event->child_mutex); |
2001 | total += perf_event_read(event); |
2002 | *enabled += event->total_time_enabled + |
2003 | atomic64_read(&event->child_total_time_enabled); |
2004 | *running += event->total_time_running + |
2005 | atomic64_read(&event->child_total_time_running); |
2006 | |
2007 | list_for_each_entry(child, &event->child_list, child_list) { |
2008 | total += perf_event_read(child); |
2009 | *enabled += child->total_time_enabled; |
2010 | *running += child->total_time_running; |
2011 | } |
2012 | mutex_unlock(&event->child_mutex); |
2013 | |
2014 | return total; |
2015 | } |
2016 | EXPORT_SYMBOL_GPL(perf_event_read_value); |
2017 | |
2018 | static int perf_event_read_group(struct perf_event *event, |
2019 | u64 read_format, char __user *buf) |
2020 | { |
2021 | struct perf_event *leader = event->group_leader, *sub; |
2022 | int n = 0, size = 0, ret = -EFAULT; |
2023 | struct perf_event_context *ctx = leader->ctx; |
2024 | u64 values[5]; |
2025 | u64 count, enabled, running; |
2026 | |
2027 | mutex_lock(&ctx->mutex); |
2028 | count = perf_event_read_value(leader, &enabled, &running); |
2029 | |
2030 | values[n++] = 1 + leader->nr_siblings; |
2031 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
2032 | values[n++] = enabled; |
2033 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
2034 | values[n++] = running; |
2035 | values[n++] = count; |
2036 | if (read_format & PERF_FORMAT_ID) |
2037 | values[n++] = primary_event_id(leader); |
2038 | |
2039 | size = n * sizeof(u64); |
2040 | |
2041 | if (copy_to_user(buf, values, size)) |
2042 | goto unlock; |
2043 | |
2044 | ret = size; |
2045 | |
2046 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
2047 | n = 0; |
2048 | |
2049 | values[n++] = perf_event_read_value(sub, &enabled, &running); |
2050 | if (read_format & PERF_FORMAT_ID) |
2051 | values[n++] = primary_event_id(sub); |
2052 | |
2053 | size = n * sizeof(u64); |
2054 | |
2055 | if (copy_to_user(buf + ret, values, size)) { |
2056 | ret = -EFAULT; |
2057 | goto unlock; |
2058 | } |
2059 | |
2060 | ret += size; |
2061 | } |
2062 | unlock: |
2063 | mutex_unlock(&ctx->mutex); |
2064 | |
2065 | return ret; |
2066 | } |
2067 | |
2068 | static int perf_event_read_one(struct perf_event *event, |
2069 | u64 read_format, char __user *buf) |
2070 | { |
2071 | u64 enabled, running; |
2072 | u64 values[4]; |
2073 | int n = 0; |
2074 | |
2075 | values[n++] = perf_event_read_value(event, &enabled, &running); |
2076 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
2077 | values[n++] = enabled; |
2078 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
2079 | values[n++] = running; |
2080 | if (read_format & PERF_FORMAT_ID) |
2081 | values[n++] = primary_event_id(event); |
2082 | |
2083 | if (copy_to_user(buf, values, n * sizeof(u64))) |
2084 | return -EFAULT; |
2085 | |
2086 | return n * sizeof(u64); |
2087 | } |
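/*
 * The resulting read() layout for a single event is thus:
 *
 *	u64 value;
 *	u64 time_enabled;	(if PERF_FORMAT_TOTAL_TIME_ENABLED)
 *	u64 time_running;	(if PERF_FORMAT_TOTAL_TIME_RUNNING)
 *	u64 id;			(if PERF_FORMAT_ID)
 *
 * which is exactly what perf_event_read_size() accounts for above.
 */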
2088 | |
2089 | /* |
2090 | * Read the performance event - simple non blocking version for now |
2091 |  * Read the performance event - simple non-blocking version for now
2092 | static ssize_t |
2093 | perf_read_hw(struct perf_event *event, char __user *buf, size_t count) |
2094 | { |
2095 | u64 read_format = event->attr.read_format; |
2096 | int ret; |
2097 | |
2098 | /* |
2099 | 	 * Return end-of-file for a read on an event that is in
2100 | * error state (i.e. because it was pinned but it couldn't be |
2101 | * scheduled on to the CPU at some point). |
2102 | */ |
2103 | if (event->state == PERF_EVENT_STATE_ERROR) |
2104 | return 0; |
2105 | |
2106 | if (count < perf_event_read_size(event)) |
2107 | return -ENOSPC; |
2108 | |
2109 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2110 | if (read_format & PERF_FORMAT_GROUP) |
2111 | ret = perf_event_read_group(event, read_format, buf); |
2112 | else |
2113 | ret = perf_event_read_one(event, read_format, buf); |
2114 | |
2115 | return ret; |
2116 | } |
2117 | |
2118 | static ssize_t |
2119 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
2120 | { |
2121 | struct perf_event *event = file->private_data; |
2122 | |
2123 | return perf_read_hw(event, buf, count); |
2124 | } |
2125 | |
2126 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
2127 | { |
2128 | struct perf_event *event = file->private_data; |
2129 | struct perf_mmap_data *data; |
2130 | 	unsigned int events = POLLHUP;
2131 | |
2132 | rcu_read_lock(); |
2133 | data = rcu_dereference(event->data); |
2134 | if (data) |
2135 | events = atomic_xchg(&data->poll, 0); |
2136 | rcu_read_unlock(); |
2137 | |
2138 | poll_wait(file, &event->waitq, wait); |
2139 | |
2140 | return events; |
2141 | } |
2142 | |
2143 | static void perf_event_reset(struct perf_event *event) |
2144 | { |
2145 | (void)perf_event_read(event); |
2146 | atomic64_set(&event->count, 0); |
2147 | perf_event_update_userpage(event); |
2148 | } |
2149 | |
2150 | /* |
2151 | * Holding the top-level event's child_mutex means that any |
2152 | * descendant process that has inherited this event will block |
2153 | * in sync_child_event if it goes to exit, thus satisfying the |
2154 | * task existence requirements of perf_event_enable/disable. |
2155 | */ |
2156 | static void perf_event_for_each_child(struct perf_event *event, |
2157 | void (*func)(struct perf_event *)) |
2158 | { |
2159 | struct perf_event *child; |
2160 | |
2161 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2162 | mutex_lock(&event->child_mutex); |
2163 | func(event); |
2164 | list_for_each_entry(child, &event->child_list, child_list) |
2165 | func(child); |
2166 | mutex_unlock(&event->child_mutex); |
2167 | } |
2168 | |
2169 | static void perf_event_for_each(struct perf_event *event, |
2170 | void (*func)(struct perf_event *)) |
2171 | { |
2172 | struct perf_event_context *ctx = event->ctx; |
2173 | struct perf_event *sibling; |
2174 | |
2175 | WARN_ON_ONCE(ctx->parent_ctx); |
2176 | mutex_lock(&ctx->mutex); |
2177 | event = event->group_leader; |
2178 | |
2179 | perf_event_for_each_child(event, func); |
2180 | func(event); |
2181 | list_for_each_entry(sibling, &event->sibling_list, group_entry) |
2182 | 		perf_event_for_each_child(sibling, func);
2183 | mutex_unlock(&ctx->mutex); |
2184 | } |
2185 | |
2186 | static int perf_event_period(struct perf_event *event, u64 __user *arg) |
2187 | { |
2188 | struct perf_event_context *ctx = event->ctx; |
2189 | unsigned long size; |
2190 | int ret = 0; |
2191 | u64 value; |
2192 | |
2193 | if (!event->attr.sample_period) |
2194 | return -EINVAL; |
2195 | |
2196 | size = copy_from_user(&value, arg, sizeof(value)); |
2197 | 	if (size)
2198 | return -EFAULT; |
2199 | |
2200 | if (!value) |
2201 | return -EINVAL; |
2202 | |
2203 | raw_spin_lock_irq(&ctx->lock); |
2204 | if (event->attr.freq) { |
2205 | if (value > sysctl_perf_event_sample_rate) { |
2206 | ret = -EINVAL; |
2207 | goto unlock; |
2208 | } |
2209 | |
2210 | event->attr.sample_freq = value; |
2211 | } else { |
2212 | event->attr.sample_period = value; |
2213 | event->hw.sample_period = value; |
2214 | } |
2215 | unlock: |
2216 | raw_spin_unlock_irq(&ctx->lock); |
2217 | |
2218 | return ret; |
2219 | } |
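/*
 * Illustrative user-space use of the ioctl above -- a minimal sketch, not
 * part of this file: after opening an event with attr.sample_period set,
 * the period can be changed on the fly with
 *
 *	u64 period = 4000;
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
 *
 * For a frequency based event (attr.freq = 1) the same ioctl reinterprets
 * the value as a sample frequency and rejects anything above
 * sysctl_perf_event_sample_rate.
 */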
2220 | |
2221 | static const struct file_operations perf_fops; |
2222 | |
2223 | static struct perf_event *perf_fget_light(int fd, int *fput_needed) |
2224 | { |
2225 | struct file *file; |
2226 | |
2227 | file = fget_light(fd, fput_needed); |
2228 | if (!file) |
2229 | return ERR_PTR(-EBADF); |
2230 | |
2231 | if (file->f_op != &perf_fops) { |
2232 | fput_light(file, *fput_needed); |
2233 | *fput_needed = 0; |
2234 | return ERR_PTR(-EBADF); |
2235 | } |
2236 | |
2237 | return file->private_data; |
2238 | } |
2239 | |
2240 | static int perf_event_set_output(struct perf_event *event, |
2241 | struct perf_event *output_event); |
2242 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
2243 | |
2244 | static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
2245 | { |
2246 | struct perf_event *event = file->private_data; |
2247 | void (*func)(struct perf_event *); |
2248 | u32 flags = arg; |
2249 | |
2250 | switch (cmd) { |
2251 | case PERF_EVENT_IOC_ENABLE: |
2252 | func = perf_event_enable; |
2253 | break; |
2254 | case PERF_EVENT_IOC_DISABLE: |
2255 | func = perf_event_disable; |
2256 | break; |
2257 | case PERF_EVENT_IOC_RESET: |
2258 | func = perf_event_reset; |
2259 | break; |
2260 | |
2261 | case PERF_EVENT_IOC_REFRESH: |
2262 | return perf_event_refresh(event, arg); |
2263 | |
2264 | case PERF_EVENT_IOC_PERIOD: |
2265 | return perf_event_period(event, (u64 __user *)arg); |
2266 | |
2267 | case PERF_EVENT_IOC_SET_OUTPUT: |
2268 | { |
2269 | struct perf_event *output_event = NULL; |
2270 | int fput_needed = 0; |
2271 | int ret; |
2272 | |
2273 | if (arg != -1) { |
2274 | output_event = perf_fget_light(arg, &fput_needed); |
2275 | if (IS_ERR(output_event)) |
2276 | return PTR_ERR(output_event); |
2277 | } |
2278 | |
2279 | ret = perf_event_set_output(event, output_event); |
2280 | if (output_event) |
2281 | fput_light(output_event->filp, fput_needed); |
2282 | |
2283 | return ret; |
2284 | } |
2285 | |
2286 | case PERF_EVENT_IOC_SET_FILTER: |
2287 | return perf_event_set_filter(event, (void __user *)arg); |
2288 | |
2289 | default: |
2290 | return -ENOTTY; |
2291 | } |
2292 | |
2293 | if (flags & PERF_IOC_FLAG_GROUP) |
2294 | perf_event_for_each(event, func); |
2295 | else |
2296 | perf_event_for_each_child(event, func); |
2297 | |
2298 | return 0; |
2299 | } |
2300 | |
2301 | int perf_event_task_enable(void) |
2302 | { |
2303 | struct perf_event *event; |
2304 | |
2305 | mutex_lock(¤t->perf_event_mutex); |
2306 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) |
2307 | perf_event_for_each_child(event, perf_event_enable); |
2308 | mutex_unlock(¤t->perf_event_mutex); |
2309 | |
2310 | return 0; |
2311 | } |
2312 | |
2313 | int perf_event_task_disable(void) |
2314 | { |
2315 | struct perf_event *event; |
2316 | |
2317 | mutex_lock(¤t->perf_event_mutex); |
2318 | list_for_each_entry(event, ¤t->perf_event_list, owner_entry) |
2319 | perf_event_for_each_child(event, perf_event_disable); |
2320 | mutex_unlock(¤t->perf_event_mutex); |
2321 | |
2322 | return 0; |
2323 | } |
2324 | |
2325 | #ifndef PERF_EVENT_INDEX_OFFSET |
2326 | # define PERF_EVENT_INDEX_OFFSET 0 |
2327 | #endif |
2328 | |
2329 | static int perf_event_index(struct perf_event *event) |
2330 | { |
2331 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2332 | return 0; |
2333 | |
2334 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; |
2335 | } |
2336 | |
2337 | /* |
2338 | * Callers need to ensure there can be no nesting of this function, otherwise |
2339 |  * the seqlock logic goes bad. We cannot serialize this because the arch
2340 | * code calls this from NMI context. |
2341 | */ |
2342 | void perf_event_update_userpage(struct perf_event *event) |
2343 | { |
2344 | struct perf_event_mmap_page *userpg; |
2345 | struct perf_mmap_data *data; |
2346 | |
2347 | rcu_read_lock(); |
2348 | data = rcu_dereference(event->data); |
2349 | if (!data) |
2350 | goto unlock; |
2351 | |
2352 | userpg = data->user_page; |
2353 | |
2354 | /* |
2355 | * Disable preemption so as to not let the corresponding user-space |
2356 | * spin too long if we get preempted. |
2357 | */ |
2358 | preempt_disable(); |
2359 | ++userpg->lock; |
2360 | barrier(); |
2361 | userpg->index = perf_event_index(event); |
2362 | userpg->offset = atomic64_read(&event->count); |
2363 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
2364 | userpg->offset -= atomic64_read(&event->hw.prev_count); |
2365 | |
2366 | userpg->time_enabled = event->total_time_enabled + |
2367 | atomic64_read(&event->child_total_time_enabled); |
2368 | |
2369 | userpg->time_running = event->total_time_running + |
2370 | atomic64_read(&event->child_total_time_running); |
2371 | |
2372 | barrier(); |
2373 | ++userpg->lock; |
2374 | preempt_enable(); |
2375 | unlock: |
2376 | rcu_read_unlock(); |
2377 | } |
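/*
 * The ->lock field written above acts as a sequence count: it is odd while
 * an update is in flight and even otherwise.  A minimal sketch of the
 * matching user-space read, assuming the control page has been mmap()ed
 * as struct perf_event_mmap_page *pc:
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		off = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 *
 * A non-zero idx means the counter can additionally be read directly
 * (e.g. rdpmc on x86 with idx - 1) and added to off.
 */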
2378 | |
2379 | #ifndef CONFIG_PERF_USE_VMALLOC |
2380 | |
2381 | /* |
2382 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. |
2383 | */ |
2384 | |
2385 | static struct page * |
2386 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) |
2387 | { |
2388 | if (pgoff > data->nr_pages) |
2389 | return NULL; |
2390 | |
2391 | if (pgoff == 0) |
2392 | return virt_to_page(data->user_page); |
2393 | |
2394 | return virt_to_page(data->data_pages[pgoff - 1]); |
2395 | } |
2396 | |
2397 | static void *perf_mmap_alloc_page(int cpu) |
2398 | { |
2399 | struct page *page; |
2400 | int node; |
2401 | |
2402 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); |
2403 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); |
2404 | if (!page) |
2405 | return NULL; |
2406 | |
2407 | return page_address(page); |
2408 | } |
2409 | |
2410 | static struct perf_mmap_data * |
2411 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) |
2412 | { |
2413 | struct perf_mmap_data *data; |
2414 | unsigned long size; |
2415 | int i; |
2416 | |
2417 | size = sizeof(struct perf_mmap_data); |
2418 | size += nr_pages * sizeof(void *); |
2419 | |
2420 | data = kzalloc(size, GFP_KERNEL); |
2421 | if (!data) |
2422 | goto fail; |
2423 | |
2424 | data->user_page = perf_mmap_alloc_page(event->cpu); |
2425 | if (!data->user_page) |
2426 | goto fail_user_page; |
2427 | |
2428 | for (i = 0; i < nr_pages; i++) { |
2429 | data->data_pages[i] = perf_mmap_alloc_page(event->cpu); |
2430 | if (!data->data_pages[i]) |
2431 | goto fail_data_pages; |
2432 | } |
2433 | |
2434 | data->nr_pages = nr_pages; |
2435 | |
2436 | return data; |
2437 | |
2438 | fail_data_pages: |
2439 | for (i--; i >= 0; i--) |
2440 | free_page((unsigned long)data->data_pages[i]); |
2441 | |
2442 | free_page((unsigned long)data->user_page); |
2443 | |
2444 | fail_user_page: |
2445 | kfree(data); |
2446 | |
2447 | fail: |
2448 | return NULL; |
2449 | } |
2450 | |
2451 | static void perf_mmap_free_page(unsigned long addr) |
2452 | { |
2453 | struct page *page = virt_to_page((void *)addr); |
2454 | |
2455 | page->mapping = NULL; |
2456 | __free_page(page); |
2457 | } |
2458 | |
2459 | static void perf_mmap_data_free(struct perf_mmap_data *data) |
2460 | { |
2461 | int i; |
2462 | |
2463 | perf_mmap_free_page((unsigned long)data->user_page); |
2464 | for (i = 0; i < data->nr_pages; i++) |
2465 | perf_mmap_free_page((unsigned long)data->data_pages[i]); |
2466 | kfree(data); |
2467 | } |
2468 | |
2469 | static inline int page_order(struct perf_mmap_data *data) |
2470 | { |
2471 | return 0; |
2472 | } |
2473 | |
2474 | #else |
2475 | |
2476 | /* |
2477 | * Back perf_mmap() with vmalloc memory. |
2478 | * |
2479 | * Required for architectures that have d-cache aliasing issues. |
2480 | */ |
2481 | |
2482 | static inline int page_order(struct perf_mmap_data *data) |
2483 | { |
2484 | return data->page_order; |
2485 | } |
2486 | |
2487 | static struct page * |
2488 | perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) |
2489 | { |
2490 | if (pgoff > (1UL << page_order(data))) |
2491 | return NULL; |
2492 | |
2493 | return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); |
2494 | } |
2495 | |
2496 | static void perf_mmap_unmark_page(void *addr) |
2497 | { |
2498 | struct page *page = vmalloc_to_page(addr); |
2499 | |
2500 | page->mapping = NULL; |
2501 | } |
2502 | |
2503 | static void perf_mmap_data_free_work(struct work_struct *work) |
2504 | { |
2505 | struct perf_mmap_data *data; |
2506 | void *base; |
2507 | int i, nr; |
2508 | |
2509 | data = container_of(work, struct perf_mmap_data, work); |
2510 | nr = 1 << page_order(data); |
2511 | |
2512 | base = data->user_page; |
2513 | for (i = 0; i < nr + 1; i++) |
2514 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); |
2515 | |
2516 | vfree(base); |
2517 | kfree(data); |
2518 | } |
2519 | |
2520 | static void perf_mmap_data_free(struct perf_mmap_data *data) |
2521 | { |
2522 | schedule_work(&data->work); |
2523 | } |
2524 | |
2525 | static struct perf_mmap_data * |
2526 | perf_mmap_data_alloc(struct perf_event *event, int nr_pages) |
2527 | { |
2528 | struct perf_mmap_data *data; |
2529 | unsigned long size; |
2530 | void *all_buf; |
2531 | |
2532 | size = sizeof(struct perf_mmap_data); |
2533 | size += sizeof(void *); |
2534 | |
2535 | data = kzalloc(size, GFP_KERNEL); |
2536 | if (!data) |
2537 | goto fail; |
2538 | |
2539 | INIT_WORK(&data->work, perf_mmap_data_free_work); |
2540 | |
2541 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); |
2542 | if (!all_buf) |
2543 | goto fail_all_buf; |
2544 | |
2545 | data->user_page = all_buf; |
2546 | data->data_pages[0] = all_buf + PAGE_SIZE; |
2547 | data->page_order = ilog2(nr_pages); |
2548 | data->nr_pages = 1; |
2549 | |
2550 | return data; |
2551 | |
2552 | fail_all_buf: |
2553 | kfree(data); |
2554 | |
2555 | fail: |
2556 | return NULL; |
2557 | } |
2558 | |
2559 | #endif |
2560 | |
2561 | static unsigned long perf_data_size(struct perf_mmap_data *data) |
2562 | { |
2563 | return data->nr_pages << (PAGE_SHIFT + page_order(data)); |
2564 | } |
2565 | |
2566 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
2567 | { |
2568 | struct perf_event *event = vma->vm_file->private_data; |
2569 | struct perf_mmap_data *data; |
2570 | int ret = VM_FAULT_SIGBUS; |
2571 | |
2572 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
2573 | if (vmf->pgoff == 0) |
2574 | ret = 0; |
2575 | return ret; |
2576 | } |
2577 | |
2578 | rcu_read_lock(); |
2579 | data = rcu_dereference(event->data); |
2580 | if (!data) |
2581 | goto unlock; |
2582 | |
2583 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
2584 | goto unlock; |
2585 | |
2586 | vmf->page = perf_mmap_to_page(data, vmf->pgoff); |
2587 | if (!vmf->page) |
2588 | goto unlock; |
2589 | |
2590 | get_page(vmf->page); |
2591 | vmf->page->mapping = vma->vm_file->f_mapping; |
2592 | vmf->page->index = vmf->pgoff; |
2593 | |
2594 | ret = 0; |
2595 | unlock: |
2596 | rcu_read_unlock(); |
2597 | |
2598 | return ret; |
2599 | } |
2600 | |
2601 | static void |
2602 | perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) |
2603 | { |
2604 | long max_size = perf_data_size(data); |
2605 | |
2606 | if (event->attr.watermark) { |
2607 | data->watermark = min_t(long, max_size, |
2608 | event->attr.wakeup_watermark); |
2609 | } |
2610 | |
2611 | if (!data->watermark) |
2612 | data->watermark = max_size / 2; |
2613 | |
2614 | atomic_set(&data->refcount, 1); |
2615 | rcu_assign_pointer(event->data, data); |
2616 | } |
2617 | |
2618 | static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) |
2619 | { |
2620 | struct perf_mmap_data *data; |
2621 | |
2622 | data = container_of(rcu_head, struct perf_mmap_data, rcu_head); |
2623 | perf_mmap_data_free(data); |
2624 | } |
2625 | |
2626 | static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) |
2627 | { |
2628 | struct perf_mmap_data *data; |
2629 | |
2630 | rcu_read_lock(); |
2631 | data = rcu_dereference(event->data); |
2632 | if (data) { |
2633 | if (!atomic_inc_not_zero(&data->refcount)) |
2634 | data = NULL; |
2635 | } |
2636 | rcu_read_unlock(); |
2637 | |
2638 | return data; |
2639 | } |
2640 | |
2641 | static void perf_mmap_data_put(struct perf_mmap_data *data) |
2642 | { |
2643 | if (!atomic_dec_and_test(&data->refcount)) |
2644 | return; |
2645 | |
2646 | call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); |
2647 | } |
2648 | |
2649 | static void perf_mmap_open(struct vm_area_struct *vma) |
2650 | { |
2651 | struct perf_event *event = vma->vm_file->private_data; |
2652 | |
2653 | atomic_inc(&event->mmap_count); |
2654 | } |
2655 | |
2656 | static void perf_mmap_close(struct vm_area_struct *vma) |
2657 | { |
2658 | struct perf_event *event = vma->vm_file->private_data; |
2659 | |
2660 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
2661 | unsigned long size = perf_data_size(event->data); |
2662 | struct user_struct *user = event->mmap_user; |
2663 | struct perf_mmap_data *data = event->data; |
2664 | |
2665 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
2666 | vma->vm_mm->locked_vm -= event->mmap_locked; |
2667 | rcu_assign_pointer(event->data, NULL); |
2668 | mutex_unlock(&event->mmap_mutex); |
2669 | |
2670 | perf_mmap_data_put(data); |
2671 | free_uid(user); |
2672 | } |
2673 | } |
2674 | |
2675 | static const struct vm_operations_struct perf_mmap_vmops = { |
2676 | .open = perf_mmap_open, |
2677 | .close = perf_mmap_close, |
2678 | .fault = perf_mmap_fault, |
2679 | .page_mkwrite = perf_mmap_fault, |
2680 | }; |
2681 | |
2682 | static int perf_mmap(struct file *file, struct vm_area_struct *vma) |
2683 | { |
2684 | struct perf_event *event = file->private_data; |
2685 | unsigned long user_locked, user_lock_limit; |
2686 | struct user_struct *user = current_user(); |
2687 | unsigned long locked, lock_limit; |
2688 | struct perf_mmap_data *data; |
2689 | unsigned long vma_size; |
2690 | unsigned long nr_pages; |
2691 | long user_extra, extra; |
2692 | int ret = 0; |
2693 | |
2694 | /* |
2695 | * Don't allow mmap() of inherited per-task counters. This would |
2696 | * create a performance issue due to all children writing to the |
2697 | * same buffer. |
2698 | */ |
2699 | if (event->cpu == -1 && event->attr.inherit) |
2700 | return -EINVAL; |
2701 | |
2702 | if (!(vma->vm_flags & VM_SHARED)) |
2703 | return -EINVAL; |
2704 | |
2705 | vma_size = vma->vm_end - vma->vm_start; |
2706 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
2707 | |
2708 | /* |
2709 | * If we have data pages ensure they're a power-of-two number, so we |
2710 | * can do bitmasks instead of modulo. |
2711 | */ |
2712 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
2713 | return -EINVAL; |
2714 | |
2715 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) |
2716 | return -EINVAL; |
2717 | |
2718 | if (vma->vm_pgoff != 0) |
2719 | return -EINVAL; |
2720 | |
2721 | WARN_ON_ONCE(event->ctx->parent_ctx); |
2722 | mutex_lock(&event->mmap_mutex); |
2723 | if (event->data) { |
2724 | if (event->data->nr_pages == nr_pages) |
2725 | atomic_inc(&event->data->refcount); |
2726 | else |
2727 | ret = -EINVAL; |
2728 | goto unlock; |
2729 | } |
2730 | |
2731 | user_extra = nr_pages + 1; |
2732 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); |
2733 | |
2734 | /* |
2735 | * Increase the limit linearly with more CPUs: |
2736 | */ |
2737 | user_lock_limit *= num_online_cpus(); |
2738 | |
2739 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; |
2740 | |
2741 | extra = 0; |
2742 | if (user_locked > user_lock_limit) |
2743 | extra = user_locked - user_lock_limit; |
2744 | |
2745 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2746 | lock_limit >>= PAGE_SHIFT; |
2747 | locked = vma->vm_mm->locked_vm + extra; |
2748 | |
2749 | if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && |
2750 | !capable(CAP_IPC_LOCK)) { |
2751 | ret = -EPERM; |
2752 | goto unlock; |
2753 | } |
2754 | |
2755 | WARN_ON(event->data); |
2756 | |
2757 | data = perf_mmap_data_alloc(event, nr_pages); |
2758 | if (!data) { |
2759 | ret = -ENOMEM; |
2760 | goto unlock; |
2761 | } |
2762 | |
2763 | perf_mmap_data_init(event, data); |
2764 | if (vma->vm_flags & VM_WRITE) |
2765 | event->data->writable = 1; |
2766 | |
2767 | atomic_long_add(user_extra, &user->locked_vm); |
2768 | event->mmap_locked = extra; |
2769 | event->mmap_user = get_current_user(); |
2770 | vma->vm_mm->locked_vm += event->mmap_locked; |
2771 | |
2772 | unlock: |
2773 | if (!ret) |
2774 | atomic_inc(&event->mmap_count); |
2775 | mutex_unlock(&event->mmap_mutex); |
2776 | |
2777 | vma->vm_flags |= VM_RESERVED; |
2778 | vma->vm_ops = &perf_mmap_vmops; |
2779 | |
2780 | return ret; |
2781 | } |
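/*
 * Illustrative user-space mapping -- a minimal sketch, not part of this
 * file: the mapping must cover 1 + 2^n pages, page 0 being the control
 * page (hence the vm_pgoff and power-of-two checks above):
 *
 *	len  = (1 + (1 << n)) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Mapping with PROT_WRITE marks the buffer writable, which lets user space
 * advance data_tail and makes perf_output_space() honour it instead of
 * overwriting unread data.
 */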
2782 | |
2783 | static int perf_fasync(int fd, struct file *filp, int on) |
2784 | { |
2785 | struct inode *inode = filp->f_path.dentry->d_inode; |
2786 | struct perf_event *event = filp->private_data; |
2787 | int retval; |
2788 | |
2789 | mutex_lock(&inode->i_mutex); |
2790 | retval = fasync_helper(fd, filp, on, &event->fasync); |
2791 | mutex_unlock(&inode->i_mutex); |
2792 | |
2793 | if (retval < 0) |
2794 | return retval; |
2795 | |
2796 | return 0; |
2797 | } |
2798 | |
2799 | static const struct file_operations perf_fops = { |
2800 | .llseek = no_llseek, |
2801 | .release = perf_release, |
2802 | .read = perf_read, |
2803 | .poll = perf_poll, |
2804 | .unlocked_ioctl = perf_ioctl, |
2805 | .compat_ioctl = perf_ioctl, |
2806 | .mmap = perf_mmap, |
2807 | .fasync = perf_fasync, |
2808 | }; |
2809 | |
2810 | /* |
2811 | * Perf event wakeup |
2812 | * |
2813 | * If there's data, ensure we set the poll() state and publish everything |
2814 | * to user-space before waking everybody up. |
2815 | */ |
2816 | |
2817 | void perf_event_wakeup(struct perf_event *event) |
2818 | { |
2819 | wake_up_all(&event->waitq); |
2820 | |
2821 | if (event->pending_kill) { |
2822 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); |
2823 | event->pending_kill = 0; |
2824 | } |
2825 | } |
2826 | |
2827 | /* |
2828 | * Pending wakeups |
2829 | * |
2830 |  * Handle the case where we need to wake up from NMI (or rq->lock) context.
2831 | * |
2832 | * The NMI bit means we cannot possibly take locks. Therefore, maintain a |
2833 | * single linked list and use cmpxchg() to add entries lockless. |
2834 | */ |
2835 | |
2836 | static void perf_pending_event(struct perf_pending_entry *entry) |
2837 | { |
2838 | struct perf_event *event = container_of(entry, |
2839 | struct perf_event, pending); |
2840 | |
2841 | if (event->pending_disable) { |
2842 | event->pending_disable = 0; |
2843 | __perf_event_disable(event); |
2844 | } |
2845 | |
2846 | if (event->pending_wakeup) { |
2847 | event->pending_wakeup = 0; |
2848 | perf_event_wakeup(event); |
2849 | } |
2850 | } |
2851 | |
2852 | #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) |
2853 | |
2854 | static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { |
2855 | PENDING_TAIL, |
2856 | }; |
2857 | |
2858 | static void perf_pending_queue(struct perf_pending_entry *entry, |
2859 | void (*func)(struct perf_pending_entry *)) |
2860 | { |
2861 | struct perf_pending_entry **head; |
2862 | |
2863 | if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) |
2864 | return; |
2865 | |
2866 | entry->func = func; |
2867 | |
2868 | head = &get_cpu_var(perf_pending_head); |
2869 | |
2870 | do { |
2871 | entry->next = *head; |
2872 | } while (cmpxchg(head, entry->next, entry) != entry->next); |
2873 | |
2874 | set_perf_event_pending(); |
2875 | |
2876 | put_cpu_var(perf_pending_head); |
2877 | } |
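/*
 * Note on the queueing above: entry->next doubles as the "already queued"
 * flag.  The first cmpxchg() (NULL -> PENDING_TAIL) claims the entry
 * exactly once even if the event fires again from NMI context, and the
 * cmpxchg() loop is a lock-free push onto the per-cpu list head.
 * set_perf_event_pending() then arranges for perf_event_do_pending() to be
 * called soon from a context where taking locks is safe, which drains the
 * list via __perf_pending_run().
 */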
2878 | |
2879 | static int __perf_pending_run(void) |
2880 | { |
2881 | struct perf_pending_entry *list; |
2882 | int nr = 0; |
2883 | |
2884 | list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); |
2885 | while (list != PENDING_TAIL) { |
2886 | void (*func)(struct perf_pending_entry *); |
2887 | struct perf_pending_entry *entry = list; |
2888 | |
2889 | list = list->next; |
2890 | |
2891 | func = entry->func; |
2892 | entry->next = NULL; |
2893 | /* |
2894 | * Ensure we observe the unqueue before we issue the wakeup, |
2895 | * so that we won't be waiting forever. |
2896 | * -- see perf_not_pending(). |
2897 | */ |
2898 | smp_wmb(); |
2899 | |
2900 | func(entry); |
2901 | nr++; |
2902 | } |
2903 | |
2904 | return nr; |
2905 | } |
2906 | |
2907 | static inline int perf_not_pending(struct perf_event *event) |
2908 | { |
2909 | /* |
2910 | * If we flush on whatever cpu we run, there is a chance we don't |
2911 | * need to wait. |
2912 | */ |
2913 | get_cpu(); |
2914 | __perf_pending_run(); |
2915 | put_cpu(); |
2916 | |
2917 | /* |
2918 | * Ensure we see the proper queue state before going to sleep |
2919 | 	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2920 | */ |
2921 | smp_rmb(); |
2922 | return event->pending.next == NULL; |
2923 | } |
2924 | |
2925 | static void perf_pending_sync(struct perf_event *event) |
2926 | { |
2927 | wait_event(event->waitq, perf_not_pending(event)); |
2928 | } |
2929 | |
2930 | void perf_event_do_pending(void) |
2931 | { |
2932 | __perf_pending_run(); |
2933 | } |
2934 | |
2935 | /* |
2936 | * Callchain support -- arch specific |
2937 | */ |
2938 | |
2939 | __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) |
2940 | { |
2941 | return NULL; |
2942 | } |
2943 | |
2944 | __weak |
2945 | void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) |
2946 | { |
2947 | } |
2948 | |
2949 | |
2950 | /* |
2951 | * We assume there is only KVM supporting the callbacks. |
2952 | * Later on, we might change it to a list if there is |
2953 | * another virtualization implementation supporting the callbacks. |
2954 | */ |
2955 | struct perf_guest_info_callbacks *perf_guest_cbs; |
2956 | |
2957 | int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) |
2958 | { |
2959 | perf_guest_cbs = cbs; |
2960 | return 0; |
2961 | } |
2962 | EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); |
2963 | |
2964 | int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) |
2965 | { |
2966 | perf_guest_cbs = NULL; |
2967 | return 0; |
2968 | } |
2969 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
2970 | |
2971 | /* |
2972 | * Output |
2973 | */ |
2974 | static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, |
2975 | unsigned long offset, unsigned long head) |
2976 | { |
2977 | unsigned long mask; |
2978 | |
2979 | if (!data->writable) |
2980 | return true; |
2981 | |
2982 | mask = perf_data_size(data) - 1; |
2983 | |
2984 | offset = (offset - tail) & mask; |
2985 | head = (head - tail) & mask; |
2986 | |
2987 | if ((int)(head - offset) < 0) |
2988 | return false; |
2989 | |
2990 | return true; |
2991 | } |
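/*
 * Worked example of the check above for a writable buffer with 4 data
 * pages (16KiB, mask = 0x3fff): with tail = 0x0800, offset = 0x3f00 and
 * head = 0x4100, offset - tail = 0x3700 and head - tail = 0x3900 modulo
 * the buffer size, so head - offset is positive and the record fits.  If
 * head had advanced past the (wrapped) tail, the difference would go
 * negative and the caller drops the record, accounting it in data->lost.
 */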
2992 | |
2993 | static void perf_output_wakeup(struct perf_output_handle *handle) |
2994 | { |
2995 | atomic_set(&handle->data->poll, POLL_IN); |
2996 | |
2997 | if (handle->nmi) { |
2998 | handle->event->pending_wakeup = 1; |
2999 | perf_pending_queue(&handle->event->pending, |
3000 | perf_pending_event); |
3001 | } else |
3002 | perf_event_wakeup(handle->event); |
3003 | } |
3004 | |
3005 | /* |
3006 |  * We need to ensure a later event doesn't publish a head when a former
3007 |  * event isn't done writing. However, since we need to deal with NMIs we
3008 | * cannot fully serialize things. |
3009 | * |
3010 | * We only publish the head (and generate a wakeup) when the outer-most |
3011 | * event completes. |
3012 | */ |
3013 | static void perf_output_get_handle(struct perf_output_handle *handle) |
3014 | { |
3015 | struct perf_mmap_data *data = handle->data; |
3016 | |
3017 | preempt_disable(); |
3018 | local_inc(&data->nest); |
3019 | handle->wakeup = local_read(&data->wakeup); |
3020 | } |
3021 | |
3022 | static void perf_output_put_handle(struct perf_output_handle *handle) |
3023 | { |
3024 | struct perf_mmap_data *data = handle->data; |
3025 | unsigned long head; |
3026 | |
3027 | again: |
3028 | head = local_read(&data->head); |
3029 | |
3030 | /* |
3031 | * IRQ/NMI can happen here, which means we can miss a head update. |
3032 | */ |
3033 | |
3034 | if (!local_dec_and_test(&data->nest)) |
3035 | goto out; |
3036 | |
3037 | /* |
3038 | * Publish the known good head. Rely on the full barrier implied |
3039 | 	 * by local_dec_and_test() to order the data->head read and this
3040 | * write. |
3041 | */ |
3042 | data->user_page->data_head = head; |
3043 | |
3044 | /* |
3045 | * Now check if we missed an update, rely on the (compiler) |
3046 | 	 * barrier in local_dec_and_test() to re-read data->head.
3047 | */ |
3048 | if (unlikely(head != local_read(&data->head))) { |
3049 | local_inc(&data->nest); |
3050 | goto again; |
3051 | } |
3052 | |
3053 | if (handle->wakeup != local_read(&data->wakeup)) |
3054 | perf_output_wakeup(handle); |
3055 | |
3056 | out: |
3057 | preempt_enable(); |
3058 | } |
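/*
 * In other words, data->nest counts how deeply we are nested inside output
 * sections on this buffer (e.g. an NMI landing between get and put).  Only
 * the outermost perf_output_put_handle() publishes data_head to user space,
 * and the re-read/again loop catches a head update done by a nested writer
 * after 'head' was sampled but before 'nest' was dropped.
 */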
3059 | |
3060 | __always_inline void perf_output_copy(struct perf_output_handle *handle, |
3061 | const void *buf, unsigned int len) |
3062 | { |
3063 | do { |
3064 | unsigned long size = min_t(unsigned long, handle->size, len); |
3065 | |
3066 | memcpy(handle->addr, buf, size); |
3067 | |
3068 | len -= size; |
3069 | handle->addr += size; |
3070 | buf += size; |
3071 | handle->size -= size; |
3072 | if (!handle->size) { |
3073 | struct perf_mmap_data *data = handle->data; |
3074 | |
3075 | handle->page++; |
3076 | handle->page &= data->nr_pages - 1; |
3077 | handle->addr = data->data_pages[handle->page]; |
3078 | handle->size = PAGE_SIZE << page_order(data); |
3079 | } |
3080 | } while (len); |
3081 | } |
3082 | |
3083 | int perf_output_begin(struct perf_output_handle *handle, |
3084 | struct perf_event *event, unsigned int size, |
3085 | int nmi, int sample) |
3086 | { |
3087 | struct perf_mmap_data *data; |
3088 | unsigned long tail, offset, head; |
3089 | int have_lost; |
3090 | struct { |
3091 | struct perf_event_header header; |
3092 | u64 id; |
3093 | u64 lost; |
3094 | } lost_event; |
3095 | |
3096 | rcu_read_lock(); |
3097 | /* |
3098 | * For inherited events we send all the output towards the parent. |
3099 | */ |
3100 | if (event->parent) |
3101 | event = event->parent; |
3102 | |
3103 | data = rcu_dereference(event->data); |
3104 | if (!data) |
3105 | goto out; |
3106 | |
3107 | handle->data = data; |
3108 | handle->event = event; |
3109 | handle->nmi = nmi; |
3110 | handle->sample = sample; |
3111 | |
3112 | if (!data->nr_pages) |
3113 | goto out; |
3114 | |
3115 | have_lost = local_read(&data->lost); |
3116 | if (have_lost) |
3117 | size += sizeof(lost_event); |
3118 | |
3119 | perf_output_get_handle(handle); |
3120 | |
3121 | do { |
3122 | /* |
3123 | 		 * Userspace could choose to issue an mb() before updating the
3124 | 		 * tail pointer, so that all reads will be completed before the
3125 | 		 * write is issued.
3126 | */ |
3127 | tail = ACCESS_ONCE(data->user_page->data_tail); |
3128 | smp_rmb(); |
3129 | offset = head = local_read(&data->head); |
3130 | head += size; |
3131 | if (unlikely(!perf_output_space(data, tail, offset, head))) |
3132 | goto fail; |
3133 | } while (local_cmpxchg(&data->head, offset, head) != offset); |
3134 | |
3135 | if (head - local_read(&data->wakeup) > data->watermark) |
3136 | local_add(data->watermark, &data->wakeup); |
3137 | |
3138 | handle->page = offset >> (PAGE_SHIFT + page_order(data)); |
3139 | handle->page &= data->nr_pages - 1; |
3140 | handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); |
3141 | handle->addr = data->data_pages[handle->page]; |
3142 | handle->addr += handle->size; |
3143 | handle->size = (PAGE_SIZE << page_order(data)) - handle->size; |
3144 | |
3145 | if (have_lost) { |
3146 | lost_event.header.type = PERF_RECORD_LOST; |
3147 | lost_event.header.misc = 0; |
3148 | lost_event.header.size = sizeof(lost_event); |
3149 | lost_event.id = event->id; |
3150 | lost_event.lost = local_xchg(&data->lost, 0); |
3151 | |
3152 | perf_output_put(handle, lost_event); |
3153 | } |
3154 | |
3155 | return 0; |
3156 | |
3157 | fail: |
3158 | local_inc(&data->lost); |
3159 | perf_output_put_handle(handle); |
3160 | out: |
3161 | rcu_read_unlock(); |
3162 | |
3163 | return -ENOSPC; |
3164 | } |
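/*
 * To summarise the fast path above: the local_cmpxchg() loop atomically
 * reserves [offset, offset + size) in the free-running data->head counter,
 * after checking against the user-visible data_tail (for writable buffers)
 * that the space really is free.  handle->page/addr/size then translate
 * the raw offset into a position within the power-of-two page array so
 * perf_output_copy() can write the record piecewise across page boundaries.
 */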
3165 | |
3166 | void perf_output_end(struct perf_output_handle *handle) |
3167 | { |
3168 | struct perf_event *event = handle->event; |
3169 | struct perf_mmap_data *data = handle->data; |
3170 | |
3171 | int wakeup_events = event->attr.wakeup_events; |
3172 | |
3173 | if (handle->sample && wakeup_events) { |
3174 | int events = local_inc_return(&data->events); |
3175 | if (events >= wakeup_events) { |
3176 | local_sub(wakeup_events, &data->events); |
3177 | local_inc(&data->wakeup); |
3178 | } |
3179 | } |
3180 | |
3181 | perf_output_put_handle(handle); |
3182 | rcu_read_unlock(); |
3183 | } |
3184 | |
3185 | static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) |
3186 | { |
3187 | /* |
3188 | * only top level events have the pid namespace they were created in |
3189 | */ |
3190 | if (event->parent) |
3191 | event = event->parent; |
3192 | |
3193 | return task_tgid_nr_ns(p, event->ns); |
3194 | } |
3195 | |
3196 | static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) |
3197 | { |
3198 | /* |
3199 | * only top level events have the pid namespace they were created in |
3200 | */ |
3201 | if (event->parent) |
3202 | event = event->parent; |
3203 | |
3204 | return task_pid_nr_ns(p, event->ns); |
3205 | } |
3206 | |
3207 | static void perf_output_read_one(struct perf_output_handle *handle, |
3208 | struct perf_event *event) |
3209 | { |
3210 | u64 read_format = event->attr.read_format; |
3211 | u64 values[4]; |
3212 | int n = 0; |
3213 | |
3214 | values[n++] = atomic64_read(&event->count); |
3215 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { |
3216 | values[n++] = event->total_time_enabled + |
3217 | atomic64_read(&event->child_total_time_enabled); |
3218 | } |
3219 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { |
3220 | values[n++] = event->total_time_running + |
3221 | atomic64_read(&event->child_total_time_running); |
3222 | } |
3223 | if (read_format & PERF_FORMAT_ID) |
3224 | values[n++] = primary_event_id(event); |
3225 | |
3226 | perf_output_copy(handle, values, n * sizeof(u64)); |
3227 | } |
3228 | |
3229 | /* |
3230 | * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. |
3231 | */ |
3232 | static void perf_output_read_group(struct perf_output_handle *handle, |
3233 | struct perf_event *event) |
3234 | { |
3235 | struct perf_event *leader = event->group_leader, *sub; |
3236 | u64 read_format = event->attr.read_format; |
3237 | u64 values[5]; |
3238 | int n = 0; |
3239 | |
3240 | values[n++] = 1 + leader->nr_siblings; |
3241 | |
3242 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) |
3243 | values[n++] = leader->total_time_enabled; |
3244 | |
3245 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) |
3246 | values[n++] = leader->total_time_running; |
3247 | |
3248 | if (leader != event) |
3249 | leader->pmu->read(leader); |
3250 | |
3251 | values[n++] = atomic64_read(&leader->count); |
3252 | if (read_format & PERF_FORMAT_ID) |
3253 | values[n++] = primary_event_id(leader); |
3254 | |
3255 | perf_output_copy(handle, values, n * sizeof(u64)); |
3256 | |
3257 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
3258 | n = 0; |
3259 | |
3260 | if (sub != event) |
3261 | sub->pmu->read(sub); |
3262 | |
3263 | values[n++] = atomic64_read(&sub->count); |
3264 | if (read_format & PERF_FORMAT_ID) |
3265 | values[n++] = primary_event_id(sub); |
3266 | |
3267 | perf_output_copy(handle, values, n * sizeof(u64)); |
3268 | } |
3269 | } |
3270 | |
3271 | static void perf_output_read(struct perf_output_handle *handle, |
3272 | struct perf_event *event) |
3273 | { |
3274 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
3275 | perf_output_read_group(handle, event); |
3276 | else |
3277 | perf_output_read_one(handle, event); |
3278 | } |
3279 | |
3280 | void perf_output_sample(struct perf_output_handle *handle, |
3281 | struct perf_event_header *header, |
3282 | struct perf_sample_data *data, |
3283 | struct perf_event *event) |
3284 | { |
3285 | u64 sample_type = data->type; |
3286 | |
3287 | perf_output_put(handle, *header); |
3288 | |
3289 | if (sample_type & PERF_SAMPLE_IP) |
3290 | perf_output_put(handle, data->ip); |
3291 | |
3292 | if (sample_type & PERF_SAMPLE_TID) |
3293 | perf_output_put(handle, data->tid_entry); |
3294 | |
3295 | if (sample_type & PERF_SAMPLE_TIME) |
3296 | perf_output_put(handle, data->time); |
3297 | |
3298 | if (sample_type & PERF_SAMPLE_ADDR) |
3299 | perf_output_put(handle, data->addr); |
3300 | |
3301 | if (sample_type & PERF_SAMPLE_ID) |
3302 | perf_output_put(handle, data->id); |
3303 | |
3304 | if (sample_type & PERF_SAMPLE_STREAM_ID) |
3305 | perf_output_put(handle, data->stream_id); |
3306 | |
3307 | if (sample_type & PERF_SAMPLE_CPU) |
3308 | perf_output_put(handle, data->cpu_entry); |
3309 | |
3310 | if (sample_type & PERF_SAMPLE_PERIOD) |
3311 | perf_output_put(handle, data->period); |
3312 | |
3313 | if (sample_type & PERF_SAMPLE_READ) |
3314 | perf_output_read(handle, event); |
3315 | |
3316 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3317 | if (data->callchain) { |
3318 | int size = 1; |
3319 | |
3320 | 			/* plus one u64 per callchain entry */
3321 | 			size += data->callchain->nr;
3322 | |
3323 | size *= sizeof(u64); |
3324 | |
3325 | perf_output_copy(handle, data->callchain, size); |
3326 | } else { |
3327 | u64 nr = 0; |
3328 | perf_output_put(handle, nr); |
3329 | } |
3330 | } |
3331 | |
3332 | if (sample_type & PERF_SAMPLE_RAW) { |
3333 | if (data->raw) { |
3334 | perf_output_put(handle, data->raw->size); |
3335 | perf_output_copy(handle, data->raw->data, |
3336 | data->raw->size); |
3337 | } else { |
3338 | struct { |
3339 | u32 size; |
3340 | u32 data; |
3341 | } raw = { |
3342 | .size = sizeof(u32), |
3343 | .data = 0, |
3344 | }; |
3345 | perf_output_put(handle, raw); |
3346 | } |
3347 | } |
3348 | } |
3349 | |
3350 | void perf_prepare_sample(struct perf_event_header *header, |
3351 | struct perf_sample_data *data, |
3352 | struct perf_event *event, |
3353 | struct pt_regs *regs) |
3354 | { |
3355 | u64 sample_type = event->attr.sample_type; |
3356 | |
3357 | data->type = sample_type; |
3358 | |
3359 | header->type = PERF_RECORD_SAMPLE; |
3360 | header->size = sizeof(*header); |
3361 | |
3362 | header->misc = 0; |
3363 | header->misc |= perf_misc_flags(regs); |
3364 | |
3365 | if (sample_type & PERF_SAMPLE_IP) { |
3366 | data->ip = perf_instruction_pointer(regs); |
3367 | |
3368 | header->size += sizeof(data->ip); |
3369 | } |
3370 | |
3371 | if (sample_type & PERF_SAMPLE_TID) { |
3372 | /* namespace issues */ |
3373 | data->tid_entry.pid = perf_event_pid(event, current); |
3374 | data->tid_entry.tid = perf_event_tid(event, current); |
3375 | |
3376 | header->size += sizeof(data->tid_entry); |
3377 | } |
3378 | |
3379 | if (sample_type & PERF_SAMPLE_TIME) { |
3380 | data->time = perf_clock(); |
3381 | |
3382 | header->size += sizeof(data->time); |
3383 | } |
3384 | |
3385 | if (sample_type & PERF_SAMPLE_ADDR) |
3386 | header->size += sizeof(data->addr); |
3387 | |
3388 | if (sample_type & PERF_SAMPLE_ID) { |
3389 | data->id = primary_event_id(event); |
3390 | |
3391 | header->size += sizeof(data->id); |
3392 | } |
3393 | |
3394 | if (sample_type & PERF_SAMPLE_STREAM_ID) { |
3395 | data->stream_id = event->id; |
3396 | |
3397 | header->size += sizeof(data->stream_id); |
3398 | } |
3399 | |
3400 | if (sample_type & PERF_SAMPLE_CPU) { |
3401 | data->cpu_entry.cpu = raw_smp_processor_id(); |
3402 | data->cpu_entry.reserved = 0; |
3403 | |
3404 | header->size += sizeof(data->cpu_entry); |
3405 | } |
3406 | |
3407 | if (sample_type & PERF_SAMPLE_PERIOD) |
3408 | header->size += sizeof(data->period); |
3409 | |
3410 | if (sample_type & PERF_SAMPLE_READ) |
3411 | header->size += perf_event_read_size(event); |
3412 | |
3413 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
3414 | int size = 1; |
3415 | |
3416 | data->callchain = perf_callchain(regs); |
3417 | |
3418 | if (data->callchain) |
3419 | size += data->callchain->nr; |
3420 | |
3421 | header->size += size * sizeof(u64); |
3422 | } |
3423 | |
3424 | if (sample_type & PERF_SAMPLE_RAW) { |
3425 | int size = sizeof(u32); |
3426 | |
3427 | if (data->raw) |
3428 | size += data->raw->size; |
3429 | else |
3430 | size += sizeof(u32); |
3431 | |
3432 | WARN_ON_ONCE(size & (sizeof(u64)-1)); |
3433 | header->size += size; |
3434 | } |
3435 | } |
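/*
 * Example of the layout this prepares: for attr.sample_type =
 * PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME | PERF_SAMPLE_PERIOD
 * the record emitted by perf_output_sample() is
 *
 *	struct perf_event_header header;	(type = PERF_RECORD_SAMPLE)
 *	u64 ip;
 *	u32 pid, tid;
 *	u64 time;
 *	u64 period;
 *
 * i.e. header.size = sizeof(header) + 4 * sizeof(u64); fields always
 * appear in the fixed order in which the PERF_SAMPLE_* bits are tested.
 */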
3436 | |
3437 | static void perf_event_output(struct perf_event *event, int nmi, |
3438 | struct perf_sample_data *data, |
3439 | struct pt_regs *regs) |
3440 | { |
3441 | struct perf_output_handle handle; |
3442 | struct perf_event_header header; |
3443 | |
3444 | perf_prepare_sample(&header, data, event, regs); |
3445 | |
3446 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) |
3447 | return; |
3448 | |
3449 | perf_output_sample(&handle, &header, data, event); |
3450 | |
3451 | perf_output_end(&handle); |
3452 | } |
3453 | |
3454 | /* |
3455 | * read event_id |
3456 |  * read event
3457 | |
3458 | struct perf_read_event { |
3459 | struct perf_event_header header; |
3460 | |
3461 | u32 pid; |
3462 | u32 tid; |
3463 | }; |
3464 | |
3465 | static void |
3466 | perf_event_read_event(struct perf_event *event, |
3467 | struct task_struct *task) |
3468 | { |
3469 | struct perf_output_handle handle; |
3470 | struct perf_read_event read_event = { |
3471 | .header = { |
3472 | .type = PERF_RECORD_READ, |
3473 | .misc = 0, |
3474 | .size = sizeof(read_event) + perf_event_read_size(event), |
3475 | }, |
3476 | .pid = perf_event_pid(event, task), |
3477 | .tid = perf_event_tid(event, task), |
3478 | }; |
3479 | int ret; |
3480 | |
3481 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); |
3482 | if (ret) |
3483 | return; |
3484 | |
3485 | perf_output_put(&handle, read_event); |
3486 | perf_output_read(&handle, event); |
3487 | |
3488 | perf_output_end(&handle); |
3489 | } |
3490 | |
3491 | /* |
3492 | * task tracking -- fork/exit |
3493 | * |
3494 | * enabled by: attr.comm | attr.mmap | attr.task |
3495 | */ |
3496 | |
3497 | struct perf_task_event { |
3498 | struct task_struct *task; |
3499 | struct perf_event_context *task_ctx; |
3500 | |
3501 | struct { |
3502 | struct perf_event_header header; |
3503 | |
3504 | u32 pid; |
3505 | u32 ppid; |
3506 | u32 tid; |
3507 | u32 ptid; |
3508 | u64 time; |
3509 | } event_id; |
3510 | }; |
3511 | |
3512 | static void perf_event_task_output(struct perf_event *event, |
3513 | struct perf_task_event *task_event) |
3514 | { |
3515 | struct perf_output_handle handle; |
3516 | struct task_struct *task = task_event->task; |
3517 | int size, ret; |
3518 | |
3519 | size = task_event->event_id.header.size; |
3520 | ret = perf_output_begin(&handle, event, size, 0, 0); |
3521 | |
3522 | if (ret) |
3523 | return; |
3524 | |
3525 | task_event->event_id.pid = perf_event_pid(event, task); |
3526 | task_event->event_id.ppid = perf_event_pid(event, current); |
3527 | |
3528 | task_event->event_id.tid = perf_event_tid(event, task); |
3529 | task_event->event_id.ptid = perf_event_tid(event, current); |
3530 | |
3531 | perf_output_put(&handle, task_event->event_id); |
3532 | |
3533 | perf_output_end(&handle); |
3534 | } |
3535 | |
3536 | static int perf_event_task_match(struct perf_event *event) |
3537 | { |
3538 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3539 | return 0; |
3540 | |
3541 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3542 | return 0; |
3543 | |
3544 | if (event->attr.comm || event->attr.mmap || event->attr.task) |
3545 | return 1; |
3546 | |
3547 | return 0; |
3548 | } |
3549 | |
3550 | static void perf_event_task_ctx(struct perf_event_context *ctx, |
3551 | struct perf_task_event *task_event) |
3552 | { |
3553 | struct perf_event *event; |
3554 | |
3555 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
3556 | if (perf_event_task_match(event)) |
3557 | perf_event_task_output(event, task_event); |
3558 | } |
3559 | } |
3560 | |
3561 | static void perf_event_task_event(struct perf_task_event *task_event) |
3562 | { |
3563 | struct perf_cpu_context *cpuctx; |
3564 | struct perf_event_context *ctx = task_event->task_ctx; |
3565 | |
3566 | rcu_read_lock(); |
3567 | cpuctx = &get_cpu_var(perf_cpu_context); |
3568 | perf_event_task_ctx(&cpuctx->ctx, task_event); |
3569 | if (!ctx) |
3570 | ctx = rcu_dereference(current->perf_event_ctxp); |
3571 | if (ctx) |
3572 | perf_event_task_ctx(ctx, task_event); |
3573 | put_cpu_var(perf_cpu_context); |
3574 | rcu_read_unlock(); |
3575 | } |
3576 | |
3577 | static void perf_event_task(struct task_struct *task, |
3578 | struct perf_event_context *task_ctx, |
3579 | int new) |
3580 | { |
3581 | struct perf_task_event task_event; |
3582 | |
3583 | if (!atomic_read(&nr_comm_events) && |
3584 | !atomic_read(&nr_mmap_events) && |
3585 | !atomic_read(&nr_task_events)) |
3586 | return; |
3587 | |
3588 | task_event = (struct perf_task_event){ |
3589 | .task = task, |
3590 | .task_ctx = task_ctx, |
3591 | .event_id = { |
3592 | .header = { |
3593 | .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, |
3594 | .misc = 0, |
3595 | .size = sizeof(task_event.event_id), |
3596 | }, |
3597 | /* .pid */ |
3598 | /* .ppid */ |
3599 | /* .tid */ |
3600 | /* .ptid */ |
3601 | .time = perf_clock(), |
3602 | }, |
3603 | }; |
3604 | |
3605 | perf_event_task_event(&task_event); |
3606 | } |
3607 | |
3608 | void perf_event_fork(struct task_struct *task) |
3609 | { |
3610 | perf_event_task(task, NULL, 1); |
3611 | } |
3612 | |
3613 | /* |
3614 | * comm tracking |
3615 | */ |
3616 | |
3617 | struct perf_comm_event { |
3618 | struct task_struct *task; |
3619 | char *comm; |
3620 | int comm_size; |
3621 | |
3622 | struct { |
3623 | struct perf_event_header header; |
3624 | |
3625 | u32 pid; |
3626 | u32 tid; |
3627 | } event_id; |
3628 | }; |
3629 | |
3630 | static void perf_event_comm_output(struct perf_event *event, |
3631 | struct perf_comm_event *comm_event) |
3632 | { |
3633 | struct perf_output_handle handle; |
3634 | int size = comm_event->event_id.header.size; |
3635 | int ret = perf_output_begin(&handle, event, size, 0, 0); |
3636 | |
3637 | if (ret) |
3638 | return; |
3639 | |
3640 | comm_event->event_id.pid = perf_event_pid(event, comm_event->task); |
3641 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
3642 | |
3643 | perf_output_put(&handle, comm_event->event_id); |
3644 | perf_output_copy(&handle, comm_event->comm, |
3645 | comm_event->comm_size); |
3646 | perf_output_end(&handle); |
3647 | } |
3648 | |
3649 | static int perf_event_comm_match(struct perf_event *event) |
3650 | { |
3651 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3652 | return 0; |
3653 | |
3654 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3655 | return 0; |
3656 | |
3657 | if (event->attr.comm) |
3658 | return 1; |
3659 | |
3660 | return 0; |
3661 | } |
3662 | |
3663 | static void perf_event_comm_ctx(struct perf_event_context *ctx, |
3664 | struct perf_comm_event *comm_event) |
3665 | { |
3666 | struct perf_event *event; |
3667 | |
3668 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
3669 | if (perf_event_comm_match(event)) |
3670 | perf_event_comm_output(event, comm_event); |
3671 | } |
3672 | } |
3673 | |
3674 | static void perf_event_comm_event(struct perf_comm_event *comm_event) |
3675 | { |
3676 | struct perf_cpu_context *cpuctx; |
3677 | struct perf_event_context *ctx; |
3678 | unsigned int size; |
3679 | char comm[TASK_COMM_LEN]; |
3680 | |
3681 | memset(comm, 0, sizeof(comm)); |
3682 | strlcpy(comm, comm_event->task->comm, sizeof(comm)); |
3683 | size = ALIGN(strlen(comm)+1, sizeof(u64)); |
3684 | |
3685 | comm_event->comm = comm; |
3686 | comm_event->comm_size = size; |
3687 | |
3688 | comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; |
3689 | |
3690 | rcu_read_lock(); |
3691 | cpuctx = &get_cpu_var(perf_cpu_context); |
3692 | perf_event_comm_ctx(&cpuctx->ctx, comm_event); |
3693 | ctx = rcu_dereference(current->perf_event_ctxp); |
3694 | if (ctx) |
3695 | perf_event_comm_ctx(ctx, comm_event); |
3696 | put_cpu_var(perf_cpu_context); |
3697 | rcu_read_unlock(); |
3698 | } |
3699 | |
3700 | void perf_event_comm(struct task_struct *task) |
3701 | { |
3702 | struct perf_comm_event comm_event; |
3703 | |
3704 | if (task->perf_event_ctxp) |
3705 | perf_event_enable_on_exec(task); |
3706 | |
3707 | if (!atomic_read(&nr_comm_events)) |
3708 | return; |
3709 | |
3710 | comm_event = (struct perf_comm_event){ |
3711 | .task = task, |
3712 | /* .comm */ |
3713 | /* .comm_size */ |
3714 | .event_id = { |
3715 | .header = { |
3716 | .type = PERF_RECORD_COMM, |
3717 | .misc = 0, |
3718 | /* .size */ |
3719 | }, |
3720 | /* .pid */ |
3721 | /* .tid */ |
3722 | }, |
3723 | }; |
3724 | |
3725 | perf_event_comm_event(&comm_event); |
3726 | } |
3727 | |
3728 | /* |
3729 | * mmap tracking |
3730 | */ |
3731 | |
3732 | struct perf_mmap_event { |
3733 | struct vm_area_struct *vma; |
3734 | |
3735 | const char *file_name; |
3736 | int file_size; |
3737 | |
3738 | struct { |
3739 | struct perf_event_header header; |
3740 | |
3741 | u32 pid; |
3742 | u32 tid; |
3743 | u64 start; |
3744 | u64 len; |
3745 | u64 pgoff; |
3746 | } event_id; |
3747 | }; |
3748 | |
3749 | static void perf_event_mmap_output(struct perf_event *event, |
3750 | struct perf_mmap_event *mmap_event) |
3751 | { |
3752 | struct perf_output_handle handle; |
3753 | int size = mmap_event->event_id.header.size; |
3754 | int ret = perf_output_begin(&handle, event, size, 0, 0); |
3755 | |
3756 | if (ret) |
3757 | return; |
3758 | |
3759 | mmap_event->event_id.pid = perf_event_pid(event, current); |
3760 | mmap_event->event_id.tid = perf_event_tid(event, current); |
3761 | |
3762 | perf_output_put(&handle, mmap_event->event_id); |
3763 | perf_output_copy(&handle, mmap_event->file_name, |
3764 | mmap_event->file_size); |
3765 | perf_output_end(&handle); |
3766 | } |
3767 | |
3768 | static int perf_event_mmap_match(struct perf_event *event, |
3769 | struct perf_mmap_event *mmap_event) |
3770 | { |
3771 | if (event->state < PERF_EVENT_STATE_INACTIVE) |
3772 | return 0; |
3773 | |
3774 | if (event->cpu != -1 && event->cpu != smp_processor_id()) |
3775 | return 0; |
3776 | |
3777 | if (event->attr.mmap) |
3778 | return 1; |
3779 | |
3780 | return 0; |
3781 | } |
3782 | |
3783 | static void perf_event_mmap_ctx(struct perf_event_context *ctx, |
3784 | struct perf_mmap_event *mmap_event) |
3785 | { |
3786 | struct perf_event *event; |
3787 | |
3788 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
3789 | if (perf_event_mmap_match(event, mmap_event)) |
3790 | perf_event_mmap_output(event, mmap_event); |
3791 | } |
3792 | } |
3793 | |
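     | /* |
     | * Resolve a human readable name for the mapping: file backed VMAs go |
     | * through d_path() (with "//enomem"/"//toolong" as fallbacks), otherwise |
     | * we try arch_vma_name(), "[vdso]" for mappings without an mm, and |
     | * "//anon" as the last resort.  The name is padded to a u64 boundary. |
     | */ |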
3794 | static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) |
3795 | { |
3796 | struct perf_cpu_context *cpuctx; |
3797 | struct perf_event_context *ctx; |
3798 | struct vm_area_struct *vma = mmap_event->vma; |
3799 | struct file *file = vma->vm_file; |
3800 | unsigned int size; |
3801 | char tmp[16]; |
3802 | char *buf = NULL; |
3803 | const char *name; |
3804 | |
3805 | memset(tmp, 0, sizeof(tmp)); |
3806 | |
3807 | if (file) { |
3808 | /* |
3809 | * d_path works from the end of the buffer backwards, so we |
3810 | * need to add enough zero bytes after the string to handle |
3811 | * the 64bit alignment we do later. |
3812 | */ |
3813 | buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); |
3814 | if (!buf) { |
3815 | name = strncpy(tmp, "//enomem", sizeof(tmp)); |
3816 | goto got_name; |
3817 | } |
3818 | name = d_path(&file->f_path, buf, PATH_MAX); |
3819 | if (IS_ERR(name)) { |
3820 | name = strncpy(tmp, "//toolong", sizeof(tmp)); |
3821 | goto got_name; |
3822 | } |
3823 | } else { |
3824 | if (arch_vma_name(mmap_event->vma)) { |
3825 | name = strncpy(tmp, arch_vma_name(mmap_event->vma), |
3826 | sizeof(tmp)); |
3827 | goto got_name; |
3828 | } |
3829 | |
3830 | if (!vma->vm_mm) { |
3831 | name = strncpy(tmp, "[vdso]", sizeof(tmp)); |
3832 | goto got_name; |
3833 | } |
3834 | |
3835 | name = strncpy(tmp, "//anon", sizeof(tmp)); |
3836 | goto got_name; |
3837 | } |
3838 | |
3839 | got_name: |
3840 | size = ALIGN(strlen(name)+1, sizeof(u64)); |
3841 | |
3842 | mmap_event->file_name = name; |
3843 | mmap_event->file_size = size; |
3844 | |
3845 | mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; |
3846 | |
3847 | rcu_read_lock(); |
3848 | cpuctx = &get_cpu_var(perf_cpu_context); |
3849 | perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); |
3850 | ctx = rcu_dereference(current->perf_event_ctxp); |
3851 | if (ctx) |
3852 | perf_event_mmap_ctx(ctx, mmap_event); |
3853 | put_cpu_var(perf_cpu_context); |
3854 | rcu_read_unlock(); |
3855 | |
3856 | kfree(buf); |
3857 | } |
3858 | |
3859 | void __perf_event_mmap(struct vm_area_struct *vma) |
3860 | { |
3861 | struct perf_mmap_event mmap_event; |
3862 | |
3863 | if (!atomic_read(&nr_mmap_events)) |
3864 | return; |
3865 | |
3866 | mmap_event = (struct perf_mmap_event){ |
3867 | .vma = vma, |
3868 | /* .file_name */ |
3869 | /* .file_size */ |
3870 | .event_id = { |
3871 | .header = { |
3872 | .type = PERF_RECORD_MMAP, |
3873 | .misc = PERF_RECORD_MISC_USER, |
3874 | /* .size */ |
3875 | }, |
3876 | /* .pid */ |
3877 | /* .tid */ |
3878 | .start = vma->vm_start, |
3879 | .len = vma->vm_end - vma->vm_start, |
3880 | .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, |
3881 | }, |
3882 | }; |
3883 | |
3884 | perf_event_mmap_event(&mmap_event); |
3885 | } |
3886 | |
3887 | /* |
3888 | * IRQ throttle logging |
3889 | */ |
3890 | |
3891 | static void perf_log_throttle(struct perf_event *event, int enable) |
3892 | { |
3893 | struct perf_output_handle handle; |
3894 | int ret; |
3895 | |
3896 | struct { |
3897 | struct perf_event_header header; |
3898 | u64 time; |
3899 | u64 id; |
3900 | u64 stream_id; |
3901 | } throttle_event = { |
3902 | .header = { |
3903 | .type = PERF_RECORD_THROTTLE, |
3904 | .misc = 0, |
3905 | .size = sizeof(throttle_event), |
3906 | }, |
3907 | .time = perf_clock(), |
3908 | .id = primary_event_id(event), |
3909 | .stream_id = event->id, |
3910 | }; |
3911 | |
3912 | if (enable) |
3913 | throttle_event.header.type = PERF_RECORD_UNTHROTTLE; |
3914 | |
3915 | ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); |
3916 | if (ret) |
3917 | return; |
3918 | |
3919 | perf_output_put(&handle, throttle_event); |
3920 | perf_output_end(&handle); |
3921 | } |
3922 | |
3923 | /* |
3924 | * Generic event overflow handling, sampling. |
3925 | */ |
3926 | |
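     | /* |
     | * Summary of the flow below: each overflow bumps hwc->interrupts; once |
     | * HZ * hwc->interrupts exceeds sysctl_perf_event_sample_rate the event |
     | * is marked MAX_INTERRUPTS, a PERF_RECORD_THROTTLE record is logged and |
     | * the caller is asked (ret == 1) to stop the event.  A later |
     | * perf_log_throttle(event, 1) emits the matching PERF_RECORD_UNTHROTTLE. |
     | */ |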
3927 | static int __perf_event_overflow(struct perf_event *event, int nmi, |
3928 | int throttle, struct perf_sample_data *data, |
3929 | struct pt_regs *regs) |
3930 | { |
3931 | int events = atomic_read(&event->event_limit); |
3932 | struct hw_perf_event *hwc = &event->hw; |
3933 | int ret = 0; |
3934 | |
3935 | throttle = (throttle && event->pmu->unthrottle != NULL); |
3936 | |
3937 | if (!throttle) { |
3938 | hwc->interrupts++; |
3939 | } else { |
3940 | if (hwc->interrupts != MAX_INTERRUPTS) { |
3941 | hwc->interrupts++; |
3942 | if (HZ * hwc->interrupts > |
3943 | (u64)sysctl_perf_event_sample_rate) { |
3944 | hwc->interrupts = MAX_INTERRUPTS; |
3945 | perf_log_throttle(event, 0); |
3946 | ret = 1; |
3947 | } |
3948 | } else { |
3949 | /* |
3950 | * Keep re-disabling the event even though we disabled it on the |
3951 | * previous pass - just in case we raced with a sched-in and it |
3952 | * got enabled again: |
3953 | */ |
3954 | ret = 1; |
3955 | } |
3956 | } |
3957 | |
3958 | if (event->attr.freq) { |
3959 | u64 now = perf_clock(); |
3960 | s64 delta = now - hwc->freq_time_stamp; |
3961 | |
3962 | hwc->freq_time_stamp = now; |
3963 | |
3964 | if (delta > 0 && delta < 2*TICK_NSEC) |
3965 | perf_adjust_period(event, delta, hwc->last_period); |
3966 | } |
3967 | |
3968 | /* |
3969 | * XXX event_limit might not quite work as expected on inherited |
3970 | * events |
3971 | */ |
3972 | |
3973 | event->pending_kill = POLL_IN; |
3974 | if (events && atomic_dec_and_test(&event->event_limit)) { |
3975 | ret = 1; |
3976 | event->pending_kill = POLL_HUP; |
3977 | if (nmi) { |
3978 | event->pending_disable = 1; |
3979 | perf_pending_queue(&event->pending, |
3980 | perf_pending_event); |
3981 | } else |
3982 | perf_event_disable(event); |
3983 | } |
3984 | |
3985 | if (event->overflow_handler) |
3986 | event->overflow_handler(event, nmi, data, regs); |
3987 | else |
3988 | perf_event_output(event, nmi, data, regs); |
3989 | |
3990 | return ret; |
3991 | } |
3992 | |
3993 | int perf_event_overflow(struct perf_event *event, int nmi, |
3994 | struct perf_sample_data *data, |
3995 | struct pt_regs *regs) |
3996 | { |
3997 | return __perf_event_overflow(event, nmi, 1, data, regs); |
3998 | } |
3999 | |
4000 | /* |
4001 | * Generic software event infrastructure |
4002 | */ |
4003 | |
4004 | /* |
4005 | * We directly increment event->count and keep a second value in |
4006 | * event->hw.period_left to count intervals. This period value |
4007 | * is kept in the range [-sample_period, 0] so that we can use its |
4008 | * sign as the overflow trigger. |
4009 | */ |
4010 | |
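     | /* |
     | * Illustrative example (not from the original source): with |
     | * sample_period == 100, period_left climbing to +20 means a period |
     | * boundary was crossed; perf_swevent_set_period() below then returns |
     | * nr = (100 + 20) / 100 = 1 and rewinds period_left to 20 - 100 = -80. |
     | * A still-negative period_left simply returns 0 (no period elapsed). |
     | */ |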
4011 | static u64 perf_swevent_set_period(struct perf_event *event) |
4012 | { |
4013 | struct hw_perf_event *hwc = &event->hw; |
4014 | u64 period = hwc->last_period; |
4015 | u64 nr, offset; |
4016 | s64 old, val; |
4017 | |
4018 | hwc->last_period = hwc->sample_period; |
4019 | |
4020 | again: |
4021 | old = val = atomic64_read(&hwc->period_left); |
4022 | if (val < 0) |
4023 | return 0; |
4024 | |
4025 | nr = div64_u64(period + val, period); |
4026 | offset = nr * period; |
4027 | val -= offset; |
4028 | if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) |
4029 | goto again; |
4030 | |
4031 | return nr; |
4032 | } |
4033 | |
4034 | static void perf_swevent_overflow(struct perf_event *event, u64 overflow, |
4035 | int nmi, struct perf_sample_data *data, |
4036 | struct pt_regs *regs) |
4037 | { |
4038 | struct hw_perf_event *hwc = &event->hw; |
4039 | int throttle = 0; |
4040 | |
4041 | data->period = event->hw.last_period; |
4042 | if (!overflow) |
4043 | overflow = perf_swevent_set_period(event); |
4044 | |
4045 | if (hwc->interrupts == MAX_INTERRUPTS) |
4046 | return; |
4047 | |
4048 | for (; overflow; overflow--) { |
4049 | if (__perf_event_overflow(event, nmi, throttle, |
4050 | data, regs)) { |
4051 | /* |
4052 | * We inhibit the overflow from happening when |
4053 | * hwc->interrupts == MAX_INTERRUPTS. |
4054 | */ |
4055 | break; |
4056 | } |
4057 | throttle = 1; |
4058 | } |
4059 | } |
4060 | |
4061 | static void perf_swevent_add(struct perf_event *event, u64 nr, |
4062 | int nmi, struct perf_sample_data *data, |
4063 | struct pt_regs *regs) |
4064 | { |
4065 | struct hw_perf_event *hwc = &event->hw; |
4066 | |
4067 | atomic64_add(nr, &event->count); |
4068 | |
4069 | if (!regs) |
4070 | return; |
4071 | |
4072 | if (!hwc->sample_period) |
4073 | return; |
4074 | |
4075 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4076 | return perf_swevent_overflow(event, 1, nmi, data, regs); |
4077 | |
4078 | if (atomic64_add_negative(nr, &hwc->period_left)) |
4079 | return; |
4080 | |
4081 | perf_swevent_overflow(event, 0, nmi, data, regs); |
4082 | } |
4083 | |
4084 | static int perf_exclude_event(struct perf_event *event, |
4085 | struct pt_regs *regs) |
4086 | { |
4087 | if (regs) { |
4088 | if (event->attr.exclude_user && user_mode(regs)) |
4089 | return 1; |
4090 | |
4091 | if (event->attr.exclude_kernel && !user_mode(regs)) |
4092 | return 1; |
4093 | } |
4094 | |
4095 | return 0; |
4096 | } |
4097 | |
4098 | static int perf_swevent_match(struct perf_event *event, |
4099 | enum perf_type_id type, |
4100 | u32 event_id, |
4101 | struct perf_sample_data *data, |
4102 | struct pt_regs *regs) |
4103 | { |
4104 | if (event->attr.type != type) |
4105 | return 0; |
4106 | |
4107 | if (event->attr.config != event_id) |
4108 | return 0; |
4109 | |
4110 | if (perf_exclude_event(event, regs)) |
4111 | return 0; |
4112 | |
4113 | return 1; |
4114 | } |
4115 | |
4116 | static inline u64 swevent_hash(u64 type, u32 event_id) |
4117 | { |
4118 | u64 val = event_id | (type << 32); |
4119 | |
4120 | return hash_64(val, SWEVENT_HLIST_BITS); |
4121 | } |
4122 | |
4123 | static inline struct hlist_head * |
4124 | __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) |
4125 | { |
4126 | u64 hash = swevent_hash(type, event_id); |
4127 | |
4128 | return &hlist->heads[hash]; |
4129 | } |
4130 | |
4131 | /* For the read side: events when they trigger */ |
4132 | static inline struct hlist_head * |
4133 | find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) |
4134 | { |
4135 | struct swevent_hlist *hlist; |
4136 | |
4137 | hlist = rcu_dereference(ctx->swevent_hlist); |
4138 | if (!hlist) |
4139 | return NULL; |
4140 | |
4141 | return __find_swevent_head(hlist, type, event_id); |
4142 | } |
4143 | |
4144 | /* For the event head insertion and removal in the hlist */ |
4145 | static inline struct hlist_head * |
4146 | find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) |
4147 | { |
4148 | struct swevent_hlist *hlist; |
4149 | u32 event_id = event->attr.config; |
4150 | u64 type = event->attr.type; |
4151 | |
4152 | /* |
4153 | * Event scheduling is always serialized against hlist allocation |
4154 | * and release, which makes the protected version suitable here; |
4155 | * the context lock guarantees that serialization. |
4156 | */ |
4157 | hlist = rcu_dereference_protected(ctx->swevent_hlist, |
4158 | lockdep_is_held(&event->ctx->lock)); |
4159 | if (!hlist) |
4160 | return NULL; |
4161 | |
4162 | return __find_swevent_head(hlist, type, event_id); |
4163 | } |
4164 | |
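     | /* |
     | * Software event dispatch: hash (type, event_id) into this CPU's |
     | * swevent hlist and feed every matching event via perf_swevent_add(). |
     | * The list walk is RCU protected. |
     | */ |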
4165 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, |
4166 | u64 nr, int nmi, |
4167 | struct perf_sample_data *data, |
4168 | struct pt_regs *regs) |
4169 | { |
4170 | struct perf_cpu_context *cpuctx; |
4171 | struct perf_event *event; |
4172 | struct hlist_node *node; |
4173 | struct hlist_head *head; |
4174 | |
4175 | cpuctx = &__get_cpu_var(perf_cpu_context); |
4176 | |
4177 | rcu_read_lock(); |
4178 | |
4179 | head = find_swevent_head_rcu(cpuctx, type, event_id); |
4180 | |
4181 | if (!head) |
4182 | goto end; |
4183 | |
4184 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4185 | if (perf_swevent_match(event, type, event_id, data, regs)) |
4186 | perf_swevent_add(event, nr, nmi, data, regs); |
4187 | } |
4188 | end: |
4189 | rcu_read_unlock(); |
4190 | } |
4191 | |
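     | /* |
     | * Per-cpu recursion protection: one slot per context level |
     | * (0 = task, 1 = softirq, 2 = hardirq, 3 = NMI).  A software event |
     | * fired while we are already processing one at the same level is |
     | * dropped (-1); otherwise the returned index must be handed back to |
     | * perf_swevent_put_recursion_context(). |
     | */ |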
4192 | int perf_swevent_get_recursion_context(void) |
4193 | { |
4194 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
4195 | int rctx; |
4196 | |
4197 | if (in_nmi()) |
4198 | rctx = 3; |
4199 | else if (in_irq()) |
4200 | rctx = 2; |
4201 | else if (in_softirq()) |
4202 | rctx = 1; |
4203 | else |
4204 | rctx = 0; |
4205 | |
4206 | if (cpuctx->recursion[rctx]) |
4207 | return -1; |
4208 | |
4209 | cpuctx->recursion[rctx]++; |
4210 | barrier(); |
4211 | |
4212 | return rctx; |
4213 | } |
4214 | EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); |
4215 | |
4216 | void perf_swevent_put_recursion_context(int rctx) |
4217 | { |
4218 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
4219 | barrier(); |
4220 | cpuctx->recursion[rctx]--; |
4221 | } |
4222 | EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); |
4223 | |
4224 | |
4225 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, |
4226 | struct pt_regs *regs, u64 addr) |
4227 | { |
4228 | struct perf_sample_data data; |
4229 | int rctx; |
4230 | |
4231 | preempt_disable_notrace(); |
4232 | rctx = perf_swevent_get_recursion_context(); |
4233 | if (rctx < 0) { |
     | /* don't leak the disabled preempt count on the early return */ |
4234 | preempt_enable_notrace(); |
     | return; |
     | } |
4235 | |
4236 | perf_sample_data_init(&data, addr); |
4237 | |
4238 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); |
4239 | |
4240 | perf_swevent_put_recursion_context(rctx); |
4241 | preempt_enable_notrace(); |
4242 | } |
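     | /* |
     | * Note: callers normally go through the perf_sw_event() wrapper from |
     | * <linux/perf_event.h>, which checks perf_swevent_enabled[event_id] |
     | * before calling __perf_sw_event() above, e.g. (illustrative only): |
     | * |
     | *	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); |
     | */ |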
4243 | |
4244 | static void perf_swevent_read(struct perf_event *event) |
4245 | { |
4246 | } |
4247 | |
4248 | static int perf_swevent_enable(struct perf_event *event) |
4249 | { |
4250 | struct hw_perf_event *hwc = &event->hw; |
4251 | struct perf_cpu_context *cpuctx; |
4252 | struct hlist_head *head; |
4253 | |
4254 | cpuctx = &__get_cpu_var(perf_cpu_context); |
4255 | |
4256 | if (hwc->sample_period) { |
4257 | hwc->last_period = hwc->sample_period; |
4258 | perf_swevent_set_period(event); |
4259 | } |
4260 | |
4261 | head = find_swevent_head(cpuctx, event); |
4262 | if (WARN_ON_ONCE(!head)) |
4263 | return -EINVAL; |
4264 | |
4265 | hlist_add_head_rcu(&event->hlist_entry, head); |
4266 | |
4267 | return 0; |
4268 | } |
4269 | |
4270 | static void perf_swevent_disable(struct perf_event *event) |
4271 | { |
4272 | hlist_del_rcu(&event->hlist_entry); |
4273 | } |
4274 | |
4275 | static void perf_swevent_void(struct perf_event *event) |
4276 | { |
4277 | } |
4278 | |
4279 | static int perf_swevent_int(struct perf_event *event) |
4280 | { |
4281 | return 0; |
4282 | } |
4283 | |
4284 | static const struct pmu perf_ops_generic = { |
4285 | .enable = perf_swevent_enable, |
4286 | .disable = perf_swevent_disable, |
4287 | .start = perf_swevent_int, |
4288 | .stop = perf_swevent_void, |
4289 | .read = perf_swevent_read, |
4290 | .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ |
4291 | }; |
4292 | |
4293 | /* |
4294 | * hrtimer based swevent callback |
4295 | */ |
4296 | |
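     | /* |
     | * The timer below re-arms itself with max(10000ns, sample_period) and |
     | * pushes a sample through perf_event_overflow() unless the event |
     | * excludes the current context; start/cancel preserve any remaining |
     | * time in hwc->remaining across disable/enable. |
     | */ |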
4297 | static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) |
4298 | { |
4299 | enum hrtimer_restart ret = HRTIMER_RESTART; |
4300 | struct perf_sample_data data; |
4301 | struct pt_regs *regs; |
4302 | struct perf_event *event; |
4303 | u64 period; |
4304 | |
4305 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
4306 | event->pmu->read(event); |
4307 | |
4308 | perf_sample_data_init(&data, 0); |
4309 | data.period = event->hw.last_period; |
4310 | regs = get_irq_regs(); |
4311 | |
4312 | if (regs && !perf_exclude_event(event, regs)) { |
4313 | if (!(event->attr.exclude_idle && current->pid == 0)) |
4314 | if (perf_event_overflow(event, 0, &data, regs)) |
4315 | ret = HRTIMER_NORESTART; |
4316 | } |
4317 | |
4318 | period = max_t(u64, 10000, event->hw.sample_period); |
4319 | hrtimer_forward_now(hrtimer, ns_to_ktime(period)); |
4320 | |
4321 | return ret; |
4322 | } |
4323 | |
4324 | static void perf_swevent_start_hrtimer(struct perf_event *event) |
4325 | { |
4326 | struct hw_perf_event *hwc = &event->hw; |
4327 | |
4328 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4329 | hwc->hrtimer.function = perf_swevent_hrtimer; |
4330 | if (hwc->sample_period) { |
4331 | u64 period; |
4332 | |
4333 | if (hwc->remaining) { |
4334 | if (hwc->remaining < 0) |
4335 | period = 10000; |
4336 | else |
4337 | period = hwc->remaining; |
4338 | hwc->remaining = 0; |
4339 | } else { |
4340 | period = max_t(u64, 10000, hwc->sample_period); |
4341 | } |
4342 | __hrtimer_start_range_ns(&hwc->hrtimer, |
4343 | ns_to_ktime(period), 0, |
4344 | HRTIMER_MODE_REL, 0); |
4345 | } |
4346 | } |
4347 | |
4348 | static void perf_swevent_cancel_hrtimer(struct perf_event *event) |
4349 | { |
4350 | struct hw_perf_event *hwc = &event->hw; |
4351 | |
4352 | if (hwc->sample_period) { |
4353 | ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); |
4354 | hwc->remaining = ktime_to_ns(remaining); |
4355 | |
4356 | hrtimer_cancel(&hwc->hrtimer); |
4357 | } |
4358 | } |
4359 | |
4360 | /* |
4361 | * Software event: cpu wall time clock |
4362 | */ |
4363 | |
4364 | static void cpu_clock_perf_event_update(struct perf_event *event) |
4365 | { |
4366 | int cpu = raw_smp_processor_id(); |
4367 | s64 prev; |
4368 | u64 now; |
4369 | |
4370 | now = cpu_clock(cpu); |
4371 | prev = atomic64_xchg(&event->hw.prev_count, now); |
4372 | atomic64_add(now - prev, &event->count); |
4373 | } |
4374 | |
4375 | static int cpu_clock_perf_event_enable(struct perf_event *event) |
4376 | { |
4377 | struct hw_perf_event *hwc = &event->hw; |
4378 | int cpu = raw_smp_processor_id(); |
4379 | |
4380 | atomic64_set(&hwc->prev_count, cpu_clock(cpu)); |
4381 | perf_swevent_start_hrtimer(event); |
4382 | |
4383 | return 0; |
4384 | } |
4385 | |
4386 | static void cpu_clock_perf_event_disable(struct perf_event *event) |
4387 | { |
4388 | perf_swevent_cancel_hrtimer(event); |
4389 | cpu_clock_perf_event_update(event); |
4390 | } |
4391 | |
4392 | static void cpu_clock_perf_event_read(struct perf_event *event) |
4393 | { |
4394 | cpu_clock_perf_event_update(event); |
4395 | } |
4396 | |
4397 | static const struct pmu perf_ops_cpu_clock = { |
4398 | .enable = cpu_clock_perf_event_enable, |
4399 | .disable = cpu_clock_perf_event_disable, |
4400 | .read = cpu_clock_perf_event_read, |
4401 | }; |
4402 | |
4403 | /* |
4404 | * Software event: task time clock |
4405 | */ |
4406 | |
4407 | static void task_clock_perf_event_update(struct perf_event *event, u64 now) |
4408 | { |
4409 | u64 prev; |
4410 | s64 delta; |
4411 | |
4412 | prev = atomic64_xchg(&event->hw.prev_count, now); |
4413 | delta = now - prev; |
4414 | atomic64_add(delta, &event->count); |
4415 | } |
4416 | |
4417 | static int task_clock_perf_event_enable(struct perf_event *event) |
4418 | { |
4419 | struct hw_perf_event *hwc = &event->hw; |
4420 | u64 now; |
4421 | |
4422 | now = event->ctx->time; |
4423 | |
4424 | atomic64_set(&hwc->prev_count, now); |
4425 | |
4426 | perf_swevent_start_hrtimer(event); |
4427 | |
4428 | return 0; |
4429 | } |
4430 | |
4431 | static void task_clock_perf_event_disable(struct perf_event *event) |
4432 | { |
4433 | perf_swevent_cancel_hrtimer(event); |
4434 | task_clock_perf_event_update(event, event->ctx->time); |
4435 | |
4436 | } |
4437 | |
4438 | static void task_clock_perf_event_read(struct perf_event *event) |
4439 | { |
4440 | u64 time; |
4441 | |
4442 | if (!in_nmi()) { |
4443 | update_context_time(event->ctx); |
4444 | time = event->ctx->time; |
4445 | } else { |
4446 | u64 now = perf_clock(); |
4447 | u64 delta = now - event->ctx->timestamp; |
4448 | time = event->ctx->time + delta; |
4449 | } |
4450 | |
4451 | task_clock_perf_event_update(event, time); |
4452 | } |
4453 | |
4454 | static const struct pmu perf_ops_task_clock = { |
4455 | .enable = task_clock_perf_event_enable, |
4456 | .disable = task_clock_perf_event_disable, |
4457 | .read = task_clock_perf_event_read, |
4458 | }; |
4459 | |
4460 | /* Deref the hlist from the update side */ |
4461 | static inline struct swevent_hlist * |
4462 | swevent_hlist_deref(struct perf_cpu_context *cpuctx) |
4463 | { |
4464 | return rcu_dereference_protected(cpuctx->swevent_hlist, |
4465 | lockdep_is_held(&cpuctx->hlist_mutex)); |
4466 | } |
4467 | |
4468 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) |
4469 | { |
4470 | struct swevent_hlist *hlist; |
4471 | |
4472 | hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); |
4473 | kfree(hlist); |
4474 | } |
4475 | |
4476 | static void swevent_hlist_release(struct perf_cpu_context *cpuctx) |
4477 | { |
4478 | struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); |
4479 | |
4480 | if (!hlist) |
4481 | return; |
4482 | |
4483 | rcu_assign_pointer(cpuctx->swevent_hlist, NULL); |
4484 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); |
4485 | } |
4486 | |
4487 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
4488 | { |
4489 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); |
4490 | |
4491 | mutex_lock(&cpuctx->hlist_mutex); |
4492 | |
4493 | if (!--cpuctx->hlist_refcount) |
4494 | swevent_hlist_release(cpuctx); |
4495 | |
4496 | mutex_unlock(&cpuctx->hlist_mutex); |
4497 | } |
4498 | |
4499 | static void swevent_hlist_put(struct perf_event *event) |
4500 | { |
4501 | int cpu; |
4502 | |
4503 | if (event->cpu != -1) { |
4504 | swevent_hlist_put_cpu(event, event->cpu); |
4505 | return; |
4506 | } |
4507 | |
4508 | for_each_possible_cpu(cpu) |
4509 | swevent_hlist_put_cpu(event, cpu); |
4510 | } |
4511 | |
4512 | static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) |
4513 | { |
4514 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); |
4515 | int err = 0; |
4516 | |
4517 | mutex_lock(&cpuctx->hlist_mutex); |
4518 | |
4519 | if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { |
4520 | struct swevent_hlist *hlist; |
4521 | |
4522 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
4523 | if (!hlist) { |
4524 | err = -ENOMEM; |
4525 | goto exit; |
4526 | } |
4527 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); |
4528 | } |
4529 | cpuctx->hlist_refcount++; |
4530 | exit: |
4531 | mutex_unlock(&cpuctx->hlist_mutex); |
4532 | |
4533 | return err; |
4534 | } |
4535 | |
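     | /* |
     | * swevent_hlist_get()/swevent_hlist_put() refcount the per-cpu hash |
     | * table used above: the first software event on a CPU allocates it, |
     | * the last one releases it (RCU deferred).  Per-task events (cpu == -1) |
     | * take a reference on every possible CPU. |
     | */ |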
4536 | static int swevent_hlist_get(struct perf_event *event) |
4537 | { |
4538 | int err; |
4539 | int cpu, failed_cpu; |
4540 | |
4541 | if (event->cpu != -1) |
4542 | return swevent_hlist_get_cpu(event, event->cpu); |
4543 | |
4544 | get_online_cpus(); |
4545 | for_each_possible_cpu(cpu) { |
4546 | err = swevent_hlist_get_cpu(event, cpu); |
4547 | if (err) { |
4548 | failed_cpu = cpu; |
4549 | goto fail; |
4550 | } |
4551 | } |
4552 | put_online_cpus(); |
4553 | |
4554 | return 0; |
4555 | fail: |
4556 | for_each_possible_cpu(cpu) { |
4557 | if (cpu == failed_cpu) |
4558 | break; |
4559 | swevent_hlist_put_cpu(event, cpu); |
4560 | } |
4561 | |
4562 | put_online_cpus(); |
4563 | return err; |
4564 | } |
4565 | |
4566 | #ifdef CONFIG_EVENT_TRACING |
4567 | |
4568 | static const struct pmu perf_ops_tracepoint = { |
4569 | .enable = perf_trace_enable, |
4570 | .disable = perf_trace_disable, |
4571 | .start = perf_swevent_int, |
4572 | .stop = perf_swevent_void, |
4573 | .read = perf_swevent_read, |
4574 | .unthrottle = perf_swevent_void, |
4575 | }; |
4576 | |
4577 | static int perf_tp_filter_match(struct perf_event *event, |
4578 | struct perf_sample_data *data) |
4579 | { |
4580 | void *record = data->raw->data; |
4581 | |
4582 | if (likely(!event->filter) || filter_match_preds(event->filter, record)) |
4583 | return 1; |
4584 | return 0; |
4585 | } |
4586 | |
4587 | static int perf_tp_event_match(struct perf_event *event, |
4588 | struct perf_sample_data *data, |
4589 | struct pt_regs *regs) |
4590 | { |
4591 | /* |
4592 | * All tracepoints are from kernel-space. |
4593 | */ |
4594 | if (event->attr.exclude_kernel) |
4595 | return 0; |
4596 | |
4597 | if (!perf_tp_filter_match(event, data)) |
4598 | return 0; |
4599 | |
4600 | return 1; |
4601 | } |
4602 | |
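     | /* |
     | * Entry point used by the tracepoint glue (perf_trace_*): wrap the raw |
     | * record in a perf_raw_record and feed every event hanging off the |
     | * tracepoint's hlist that passes perf_tp_event_match() above. |
     | */ |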
4603 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
4604 | struct pt_regs *regs, struct hlist_head *head) |
4605 | { |
4606 | struct perf_sample_data data; |
4607 | struct perf_event *event; |
4608 | struct hlist_node *node; |
4609 | |
4610 | struct perf_raw_record raw = { |
4611 | .size = entry_size, |
4612 | .data = record, |
4613 | }; |
4614 | |
4615 | perf_sample_data_init(&data, addr); |
4616 | data.raw = &raw; |
4617 | |
4618 | rcu_read_lock(); |
4619 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
4620 | if (perf_tp_event_match(event, &data, regs)) |
4621 | perf_swevent_add(event, count, 1, &data, regs); |
4622 | } |
4623 | rcu_read_unlock(); |
4624 | } |
4625 | EXPORT_SYMBOL_GPL(perf_tp_event); |
4626 | |
4627 | static void tp_perf_event_destroy(struct perf_event *event) |
4628 | { |
4629 | perf_trace_destroy(event); |
4630 | } |
4631 | |
4632 | static const struct pmu *tp_perf_event_init(struct perf_event *event) |
4633 | { |
4634 | int err; |
4635 | |
4636 | /* |
4637 | * Raw tracepoint data is a severe data leak, only allow root to |
4638 | * have these. |
4639 | */ |
4640 | if ((event->attr.sample_type & PERF_SAMPLE_RAW) && |
4641 | perf_paranoid_tracepoint_raw() && |
4642 | !capable(CAP_SYS_ADMIN)) |
4643 | return ERR_PTR(-EPERM); |
4644 | |
4645 | err = perf_trace_init(event); |
4646 | if (err) |
4647 | return NULL; |
4648 | |
4649 | event->destroy = tp_perf_event_destroy; |
4650 | |
4651 | return &perf_ops_tracepoint; |
4652 | } |
4653 | |
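     | /* |
     | * Reached via the PERF_EVENT_IOC_SET_FILTER ioctl; user space passes a |
     | * trace-event filter string, e.g. (illustrative only): |
     | * |
     | *	ioctl(fd, PERF_EVENT_IOC_SET_FILTER, "common_pid != 0"); |
     | */ |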
4654 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
4655 | { |
4656 | char *filter_str; |
4657 | int ret; |
4658 | |
4659 | if (event->attr.type != PERF_TYPE_TRACEPOINT) |
4660 | return -EINVAL; |
4661 | |
4662 | filter_str = strndup_user(arg, PAGE_SIZE); |
4663 | if (IS_ERR(filter_str)) |
4664 | return PTR_ERR(filter_str); |
4665 | |
4666 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); |
4667 | |
4668 | kfree(filter_str); |
4669 | return ret; |
4670 | } |
4671 | |
4672 | static void perf_event_free_filter(struct perf_event *event) |
4673 | { |
4674 | ftrace_profile_free_filter(event); |
4675 | } |
4676 | |
4677 | #else |
4678 | |
4679 | static const struct pmu *tp_perf_event_init(struct perf_event *event) |
4680 | { |
4681 | return NULL; |
4682 | } |
4683 | |
4684 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
4685 | { |
4686 | return -ENOENT; |
4687 | } |
4688 | |
4689 | static void perf_event_free_filter(struct perf_event *event) |
4690 | { |
4691 | } |
4692 | |
4693 | #endif /* CONFIG_EVENT_TRACING */ |
4694 | |
4695 | #ifdef CONFIG_HAVE_HW_BREAKPOINT |
4696 | static void bp_perf_event_destroy(struct perf_event *event) |
4697 | { |
4698 | release_bp_slot(event); |
4699 | } |
4700 | |
4701 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) |
4702 | { |
4703 | int err; |
4704 | |
4705 | err = register_perf_hw_breakpoint(bp); |
4706 | if (err) |
4707 | return ERR_PTR(err); |
4708 | |
4709 | bp->destroy = bp_perf_event_destroy; |
4710 | |
4711 | return &perf_ops_bp; |
4712 | } |
4713 | |
4714 | void perf_bp_event(struct perf_event *bp, void *data) |
4715 | { |
4716 | struct perf_sample_data sample; |
4717 | struct pt_regs *regs = data; |
4718 | |
4719 | perf_sample_data_init(&sample, bp->attr.bp_addr); |
4720 | |
4721 | if (!perf_exclude_event(bp, regs)) |
4722 | perf_swevent_add(bp, 1, 1, &sample, regs); |
4723 | } |
4724 | #else |
4725 | static const struct pmu *bp_perf_event_init(struct perf_event *bp) |
4726 | { |
4727 | return NULL; |
4728 | } |
4729 | |
4730 | void perf_bp_event(struct perf_event *bp, void *regs) |
4731 | { |
4732 | } |
4733 | #endif |
4734 | |
4735 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
4736 | |
4737 | static void sw_perf_event_destroy(struct perf_event *event) |
4738 | { |
4739 | u64 event_id = event->attr.config; |
4740 | |
4741 | WARN_ON(event->parent); |
4742 | |
4743 | atomic_dec(&perf_swevent_enabled[event_id]); |
4744 | swevent_hlist_put(event); |
4745 | } |
4746 | |
4747 | static const struct pmu *sw_perf_event_init(struct perf_event *event) |
4748 | { |
4749 | const struct pmu *pmu = NULL; |
4750 | u64 event_id = event->attr.config; |
4751 | |
4752 | /* |
4753 | * Software events (currently) can't in general distinguish |
4754 | * between user, kernel and hypervisor events. |
4755 | * However, context switches and cpu migrations are considered |
4756 | * to be kernel events, and page faults are never hypervisor |
4757 | * events. |
4758 | */ |
4759 | switch (event_id) { |
4760 | case PERF_COUNT_SW_CPU_CLOCK: |
4761 | pmu = &perf_ops_cpu_clock; |
4762 | |
4763 | break; |
4764 | case PERF_COUNT_SW_TASK_CLOCK: |
4765 | /* |
4766 | * If the user instantiates this as a per-cpu event, |
4767 | * use the cpu_clock event instead. |
4768 | */ |
4769 | if (event->ctx->task) |
4770 | pmu = &perf_ops_task_clock; |
4771 | else |
4772 | pmu = &perf_ops_cpu_clock; |
4773 | |
4774 | break; |
4775 | case PERF_COUNT_SW_PAGE_FAULTS: |
4776 | case PERF_COUNT_SW_PAGE_FAULTS_MIN: |
4777 | case PERF_COUNT_SW_PAGE_FAULTS_MAJ: |
4778 | case PERF_COUNT_SW_CONTEXT_SWITCHES: |
4779 | case PERF_COUNT_SW_CPU_MIGRATIONS: |
4780 | case PERF_COUNT_SW_ALIGNMENT_FAULTS: |
4781 | case PERF_COUNT_SW_EMULATION_FAULTS: |
4782 | if (!event->parent) { |
4783 | int err; |
4784 | |
4785 | err = swevent_hlist_get(event); |
4786 | if (err) |
4787 | return ERR_PTR(err); |
4788 | |
4789 | atomic_inc(&perf_swevent_enabled[event_id]); |
4790 | event->destroy = sw_perf_event_destroy; |
4791 | } |
4792 | pmu = &perf_ops_generic; |
4793 | break; |
4794 | } |
4795 | |
4796 | return pmu; |
4797 | } |
4798 | |
4799 | /* |
4800 | * Allocate and initialize an event structure |
4801 | */ |
4802 | static struct perf_event * |
4803 | perf_event_alloc(struct perf_event_attr *attr, |
4804 | int cpu, |
4805 | struct perf_event_context *ctx, |
4806 | struct perf_event *group_leader, |
4807 | struct perf_event *parent_event, |
4808 | perf_overflow_handler_t overflow_handler, |
4809 | gfp_t gfpflags) |
4810 | { |
4811 | const struct pmu *pmu; |
4812 | struct perf_event *event; |
4813 | struct hw_perf_event *hwc; |
4814 | long err; |
4815 | |
4816 | event = kzalloc(sizeof(*event), gfpflags); |
4817 | if (!event) |
4818 | return ERR_PTR(-ENOMEM); |
4819 | |
4820 | /* |
4821 | * Single events are their own group leaders, with an |
4822 | * empty sibling list: |
4823 | */ |
4824 | if (!group_leader) |
4825 | group_leader = event; |
4826 | |
4827 | mutex_init(&event->child_mutex); |
4828 | INIT_LIST_HEAD(&event->child_list); |
4829 | |
4830 | INIT_LIST_HEAD(&event->group_entry); |
4831 | INIT_LIST_HEAD(&event->event_entry); |
4832 | INIT_LIST_HEAD(&event->sibling_list); |
4833 | init_waitqueue_head(&event->waitq); |
4834 | |
4835 | mutex_init(&event->mmap_mutex); |
4836 | |
4837 | event->cpu = cpu; |
4838 | event->attr = *attr; |
4839 | event->group_leader = group_leader; |
4840 | event->pmu = NULL; |
4841 | event->ctx = ctx; |
4842 | event->oncpu = -1; |
4843 | |
4844 | event->parent = parent_event; |
4845 | |
4846 | event->ns = get_pid_ns(current->nsproxy->pid_ns); |
4847 | event->id = atomic64_inc_return(&perf_event_id); |
4848 | |
4849 | event->state = PERF_EVENT_STATE_INACTIVE; |
4850 | |
4851 | if (!overflow_handler && parent_event) |
4852 | overflow_handler = parent_event->overflow_handler; |
4853 | |
4854 | event->overflow_handler = overflow_handler; |
4855 | |
4856 | if (attr->disabled) |
4857 | event->state = PERF_EVENT_STATE_OFF; |
4858 | |
4859 | pmu = NULL; |
4860 | |
4861 | hwc = &event->hw; |
4862 | hwc->sample_period = attr->sample_period; |
4863 | if (attr->freq && attr->sample_freq) |
4864 | hwc->sample_period = 1; |
4865 | hwc->last_period = hwc->sample_period; |
4866 | |
4867 | atomic64_set(&hwc->period_left, hwc->sample_period); |
4868 | |
4869 | /* |
4870 | * we currently do not support PERF_FORMAT_GROUP on inherited events |
4871 | */ |
4872 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) |
4873 | goto done; |
4874 | |
4875 | switch (attr->type) { |
4876 | case PERF_TYPE_RAW: |
4877 | case PERF_TYPE_HARDWARE: |
4878 | case PERF_TYPE_HW_CACHE: |
4879 | pmu = hw_perf_event_init(event); |
4880 | break; |
4881 | |
4882 | case PERF_TYPE_SOFTWARE: |
4883 | pmu = sw_perf_event_init(event); |
4884 | break; |
4885 | |
4886 | case PERF_TYPE_TRACEPOINT: |
4887 | pmu = tp_perf_event_init(event); |
4888 | break; |
4889 | |
4890 | case PERF_TYPE_BREAKPOINT: |
4891 | pmu = bp_perf_event_init(event); |
4892 | break; |
4893 | |
4894 | |
4895 | default: |
4896 | break; |
4897 | } |
4898 | done: |
4899 | err = 0; |
4900 | if (!pmu) |
4901 | err = -EINVAL; |
4902 | else if (IS_ERR(pmu)) |
4903 | err = PTR_ERR(pmu); |
4904 | |
4905 | if (err) { |
4906 | if (event->ns) |
4907 | put_pid_ns(event->ns); |
4908 | kfree(event); |
4909 | return ERR_PTR(err); |
4910 | } |
4911 | |
4912 | event->pmu = pmu; |
4913 | |
4914 | if (!event->parent) { |
4915 | atomic_inc(&nr_events); |
4916 | if (event->attr.mmap) |
4917 | atomic_inc(&nr_mmap_events); |
4918 | if (event->attr.comm) |
4919 | atomic_inc(&nr_comm_events); |
4920 | if (event->attr.task) |
4921 | atomic_inc(&nr_task_events); |
4922 | } |
4923 | |
4924 | return event; |
4925 | } |
4926 | |
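     | /* |
     | * ABI handling for the versioned perf_event_attr: a smaller (older) |
     | * struct is accepted as-is (the tail stays zeroed), a larger (newer) |
     | * one is accepted only if every byte past our sizeof(*attr) is zero; |
     | * otherwise we write our size back and fail with -E2BIG. |
     | */ |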
4927 | static int perf_copy_attr(struct perf_event_attr __user *uattr, |
4928 | struct perf_event_attr *attr) |
4929 | { |
4930 | u32 size; |
4931 | int ret; |
4932 | |
4933 | if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) |
4934 | return -EFAULT; |
4935 | |
4936 | /* |
4937 | * zero the full structure, so that a short copy will be nice. |
4938 | */ |
4939 | memset(attr, 0, sizeof(*attr)); |
4940 | |
4941 | ret = get_user(size, &uattr->size); |
4942 | if (ret) |
4943 | return ret; |
4944 | |
4945 | if (size > PAGE_SIZE) /* silly large */ |
4946 | goto err_size; |
4947 | |
4948 | if (!size) /* abi compat */ |
4949 | size = PERF_ATTR_SIZE_VER0; |
4950 | |
4951 | if (size < PERF_ATTR_SIZE_VER0) |
4952 | goto err_size; |
4953 | |
4954 | /* |
4955 | * If we're handed a bigger struct than we know of, |
4956 | * ensure all the unknown bits are 0 - i.e. new |
4957 | * user-space does not rely on any kernel feature |
4958 | * extensions we don't know about yet. |
4959 | */ |
4960 | if (size > sizeof(*attr)) { |
4961 | unsigned char __user *addr; |
4962 | unsigned char __user *end; |
4963 | unsigned char val; |
4964 | |
4965 | addr = (void __user *)uattr + sizeof(*attr); |
4966 | end = (void __user *)uattr + size; |
4967 | |
4968 | for (; addr < end; addr++) { |
4969 | ret = get_user(val, addr); |
4970 | if (ret) |
4971 | return ret; |
4972 | if (val) |
4973 | goto err_size; |
4974 | } |
4975 | size = sizeof(*attr); |
4976 | } |
4977 | |
4978 | ret = copy_from_user(attr, uattr, size); |
4979 | if (ret) |
4980 | return -EFAULT; |
4981 | |
4982 | /* |
4983 | * If the type exists, the corresponding creation will verify |
4984 | * the attr->config. |
4985 | */ |
4986 | if (attr->type >= PERF_TYPE_MAX) |
4987 | return -EINVAL; |
4988 | |
4989 | if (attr->__reserved_1) |
4990 | return -EINVAL; |
4991 | |
4992 | if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) |
4993 | return -EINVAL; |
4994 | |
4995 | if (attr->read_format & ~(PERF_FORMAT_MAX-1)) |
4996 | return -EINVAL; |
4997 | |
4998 | out: |
4999 | return ret; |
5000 | |
5001 | err_size: |
5002 | put_user(sizeof(*attr), &uattr->size); |
5003 | ret = -E2BIG; |
5004 | goto out; |
5005 | } |
5006 | |
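     | /* |
     | * Redirect this event's output into output_event's ring buffer so |
     | * several events can share one mmap() area.  Only allowed on the same |
     | * CPU (or the same task context for cpu == -1), never to itself, and |
     | * not while the event already has an active mmap().  Reached via |
     | * PERF_FLAG_FD_OUTPUT in sys_perf_event_open() below. |
     | */ |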
5007 | static int |
5008 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
5009 | { |
5010 | struct perf_mmap_data *data = NULL, *old_data = NULL; |
5011 | int ret = -EINVAL; |
5012 | |
5013 | if (!output_event) |
5014 | goto set; |
5015 | |
5016 | /* don't allow circular references */ |
5017 | if (event == output_event) |
5018 | goto out; |
5019 | |
5020 | /* |
5021 | * Don't allow cross-cpu buffers |
5022 | */ |
5023 | if (output_event->cpu != event->cpu) |
5024 | goto out; |
5025 | |
5026 | /* |
5027 | * If its not a per-cpu buffer, it must be the same task. |
5028 | */ |
5029 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
5030 | goto out; |
5031 | |
5032 | set: |
5033 | mutex_lock(&event->mmap_mutex); |
5034 | /* Can't redirect output if we've got an active mmap() */ |
5035 | if (atomic_read(&event->mmap_count)) |
5036 | goto unlock; |
5037 | |
5038 | if (output_event) { |
5039 | /* get the buffer we want to redirect to */ |
5040 | data = perf_mmap_data_get(output_event); |
5041 | if (!data) |
5042 | goto unlock; |
5043 | } |
5044 | |
5045 | old_data = event->data; |
5046 | rcu_assign_pointer(event->data, data); |
5047 | ret = 0; |
5048 | unlock: |
5049 | mutex_unlock(&event->mmap_mutex); |
5050 | |
5051 | if (old_data) |
5052 | perf_mmap_data_put(old_data); |
5053 | out: |
5054 | return ret; |
5055 | } |
5056 | |
5057 | /** |
5058 | * sys_perf_event_open - open a performance event, associate it to a task/cpu |
5059 | * |
5060 | * @attr_uptr: event_id type attributes for monitoring/sampling |
5061 | * @pid: target pid |
5062 | * @cpu: target cpu |
5063 | * @group_fd: group leader event fd |
     | * @flags: PERF_FLAG_FD_NO_GROUP / PERF_FLAG_FD_OUTPUT behaviour flags |
5064 | */ |
5065 | SYSCALL_DEFINE5(perf_event_open, |
5066 | struct perf_event_attr __user *, attr_uptr, |
5067 | pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) |
5068 | { |
5069 | struct perf_event *event, *group_leader = NULL, *output_event = NULL; |
5070 | struct perf_event_attr attr; |
5071 | struct perf_event_context *ctx; |
5072 | struct file *event_file = NULL; |
5073 | struct file *group_file = NULL; |
5074 | int event_fd; |
5075 | int fput_needed = 0; |
5076 | int err; |
5077 | |
5078 | /* for future expandability... */ |
5079 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) |
5080 | return -EINVAL; |
5081 | |
5082 | err = perf_copy_attr(attr_uptr, &attr); |
5083 | if (err) |
5084 | return err; |
5085 | |
5086 | if (!attr.exclude_kernel) { |
5087 | if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) |
5088 | return -EACCES; |
5089 | } |
5090 | |
5091 | if (attr.freq) { |
5092 | if (attr.sample_freq > sysctl_perf_event_sample_rate) |
5093 | return -EINVAL; |
5094 | } |
5095 | |
5096 | event_fd = get_unused_fd_flags(O_RDWR); |
5097 | if (event_fd < 0) |
5098 | return event_fd; |
5099 | |
5100 | /* |
5101 | * Get the target context (task or percpu): |
5102 | */ |
5103 | ctx = find_get_context(pid, cpu); |
5104 | if (IS_ERR(ctx)) { |
5105 | err = PTR_ERR(ctx); |
5106 | goto err_fd; |
5107 | } |
5108 | |
5109 | if (group_fd != -1) { |
5110 | group_leader = perf_fget_light(group_fd, &fput_needed); |
5111 | if (IS_ERR(group_leader)) { |
5112 | err = PTR_ERR(group_leader); |
5113 | goto err_put_context; |
5114 | } |
5115 | group_file = group_leader->filp; |
5116 | if (flags & PERF_FLAG_FD_OUTPUT) |
5117 | output_event = group_leader; |
5118 | if (flags & PERF_FLAG_FD_NO_GROUP) |
5119 | group_leader = NULL; |
5120 | } |
5121 | |
5122 | /* |
5123 | * Look up the group leader (we will attach this event to it): |
5124 | */ |
5125 | if (group_leader) { |
5126 | err = -EINVAL; |
5127 | |
5128 | /* |
5129 | * Do not allow a recursive hierarchy (this new sibling |
5130 | * becoming part of another group-sibling): |
5131 | */ |
5132 | if (group_leader->group_leader != group_leader) |
5133 | goto err_put_context; |
5134 | /* |
5135 | * Do not allow to attach to a group in a different |
5136 | * task or CPU context: |
5137 | */ |
5138 | if (group_leader->ctx != ctx) |
5139 | goto err_put_context; |
5140 | /* |
5141 | * Only a group leader can be exclusive or pinned |
5142 | */ |
5143 | if (attr.exclusive || attr.pinned) |
5144 | goto err_put_context; |
5145 | } |
5146 | |
5147 | event = perf_event_alloc(&attr, cpu, ctx, group_leader, |
5148 | NULL, NULL, GFP_KERNEL); |
5149 | if (IS_ERR(event)) { |
5150 | err = PTR_ERR(event); |
5151 | goto err_put_context; |
5152 | } |
5153 | |
5154 | if (output_event) { |
5155 | err = perf_event_set_output(event, output_event); |
5156 | if (err) |
5157 | goto err_free_put_context; |
5158 | } |
5159 | |
5160 | event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); |
5161 | if (IS_ERR(event_file)) { |
5162 | err = PTR_ERR(event_file); |
5163 | goto err_free_put_context; |
5164 | } |
5165 | |
5166 | event->filp = event_file; |
5167 | WARN_ON_ONCE(ctx->parent_ctx); |
5168 | mutex_lock(&ctx->mutex); |
5169 | perf_install_in_context(ctx, event, cpu); |
5170 | ++ctx->generation; |
5171 | mutex_unlock(&ctx->mutex); |
5172 | |
5173 | event->owner = current; |
5174 | get_task_struct(current); |
5175 | mutex_lock(¤t->perf_event_mutex); |
5176 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
5177 | mutex_unlock(¤t->perf_event_mutex); |
5178 | |
5179 | /* |
5180 | * Drop the reference on the group_event after placing the |
5181 | * new event on the sibling_list. This ensures destruction |
5182 | * of the group leader will find the pointer to itself in |
5183 | * perf_group_detach(). |
5184 | */ |
5185 | fput_light(group_file, fput_needed); |
5186 | fd_install(event_fd, event_file); |
5187 | return event_fd; |
5188 | |
5189 | err_free_put_context: |
5190 | free_event(event); |
5191 | err_put_context: |
5192 | fput_light(group_file, fput_needed); |
5193 | put_ctx(ctx); |
5194 | err_fd: |
5195 | put_unused_fd(event_fd); |
5196 | return err; |
5197 | } |
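     | /* |
     | * Illustrative user-space usage (a sketch, not part of this file): |
     | * |
     | *	struct perf_event_attr attr = { |
     | *		.type	= PERF_TYPE_HARDWARE, |
     | *		.config	= PERF_COUNT_HW_CPU_CYCLES, |
     | *		.size	= sizeof(attr), |
     | *	}; |
     | *	long long count; |
     | *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); |
     | *	... run the workload ... |
     | *	read(fd, &count, sizeof(count)); |
     | * |
     | * pid == 0 profiles the calling task, cpu == -1 means any CPU, |
     | * group_fd == -1 creates a new group, flags == 0. |
     | */ |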
5198 | |
5199 | /** |
5200 | * perf_event_create_kernel_counter |
5201 | * |
5202 | * @attr: attributes of the counter to create |
5203 | * @cpu: cpu on which the counter is bound |
5204 | * @pid: task to profile |
     | * @overflow_handler: callback invoked when the event overflows |
5205 | */ |
5206 | struct perf_event * |
5207 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
5208 | pid_t pid, |
5209 | perf_overflow_handler_t overflow_handler) |
5210 | { |
5211 | struct perf_event *event; |
5212 | struct perf_event_context *ctx; |
5213 | int err; |
5214 | |
5215 | /* |
5216 | * Get the target context (task or percpu): |
5217 | */ |
5218 | |
5219 | ctx = find_get_context(pid, cpu); |
5220 | if (IS_ERR(ctx)) { |
5221 | err = PTR_ERR(ctx); |
5222 | goto err_exit; |
5223 | } |
5224 | |
5225 | event = perf_event_alloc(attr, cpu, ctx, NULL, |
5226 | NULL, overflow_handler, GFP_KERNEL); |
5227 | if (IS_ERR(event)) { |
5228 | err = PTR_ERR(event); |
5229 | goto err_put_context; |
5230 | } |
5231 | |
5232 | event->filp = NULL; |
5233 | WARN_ON_ONCE(ctx->parent_ctx); |
5234 | mutex_lock(&ctx->mutex); |
5235 | perf_install_in_context(ctx, event, cpu); |
5236 | ++ctx->generation; |
5237 | mutex_unlock(&ctx->mutex); |
5238 | |
5239 | event->owner = current; |
5240 | get_task_struct(current); |
5241 | mutex_lock(¤t->perf_event_mutex); |
5242 | list_add_tail(&event->owner_entry, ¤t->perf_event_list); |
5243 | mutex_unlock(¤t->perf_event_mutex); |
5244 | |
5245 | return event; |
5246 | |
5247 | err_put_context: |
5248 | put_ctx(ctx); |
5249 | err_exit: |
5250 | return ERR_PTR(err); |
5251 | } |
5252 | EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); |
5253 | |
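     | /* |
     | * Event inheritance: a child task gets its own copy of each inheritable |
     | * parent event, linked back to the original (top-most) parent so the |
     | * parent's file refcount keeps it alive; the child's counts are folded |
     | * back into the parent in sync_child_event() when the child exits. |
     | */ |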
5254 | /* |
5255 | * inherit an event from the parent task to the child task: |
5256 | */ |
5257 | static struct perf_event * |
5258 | inherit_event(struct perf_event *parent_event, |
5259 | struct task_struct *parent, |
5260 | struct perf_event_context *parent_ctx, |
5261 | struct task_struct *child, |
5262 | struct perf_event *group_leader, |
5263 | struct perf_event_context *child_ctx) |
5264 | { |
5265 | struct perf_event *child_event; |
5266 | |
5267 | /* |
5268 | * Instead of creating recursive hierarchies of events, |
5269 | * we link inherited events back to the original parent, |
5270 | * which has a filp for sure, which we use as the reference |
5271 | * count: |
5272 | */ |
5273 | if (parent_event->parent) |
5274 | parent_event = parent_event->parent; |
5275 | |
5276 | child_event = perf_event_alloc(&parent_event->attr, |
5277 | parent_event->cpu, child_ctx, |
5278 | group_leader, parent_event, |
5279 | NULL, GFP_KERNEL); |
5280 | if (IS_ERR(child_event)) |
5281 | return child_event; |
5282 | get_ctx(child_ctx); |
5283 | |
5284 | /* |
5285 | * Make the child state follow the state of the parent event, |
5286 | * not its attr.disabled bit. We hold the parent's mutex, |
5287 | * so we won't race with perf_event_{en, dis}able_family. |
5288 | */ |
5289 | if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) |
5290 | child_event->state = PERF_EVENT_STATE_INACTIVE; |
5291 | else |
5292 | child_event->state = PERF_EVENT_STATE_OFF; |
5293 | |
5294 | if (parent_event->attr.freq) { |
5295 | u64 sample_period = parent_event->hw.sample_period; |
5296 | struct hw_perf_event *hwc = &child_event->hw; |
5297 | |
5298 | hwc->sample_period = sample_period; |
5299 | hwc->last_period = sample_period; |
5300 | |
5301 | atomic64_set(&hwc->period_left, sample_period); |
5302 | } |
5303 | |
5304 | child_event->overflow_handler = parent_event->overflow_handler; |
5305 | |
5306 | /* |
5307 | * Link it up in the child's context: |
5308 | */ |
5309 | add_event_to_ctx(child_event, child_ctx); |
5310 | |
5311 | /* |
5312 | * Get a reference to the parent filp - we will fput it |
5313 | * when the child event exits. This is safe to do because |
5314 | * we are in the parent and we know that the filp still |
5315 | * exists and has a nonzero count: |
5316 | */ |
5317 | atomic_long_inc(&parent_event->filp->f_count); |
5318 | |
5319 | /* |
5320 | * Link this into the parent event's child list |
5321 | */ |
5322 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); |
5323 | mutex_lock(&parent_event->child_mutex); |
5324 | list_add_tail(&child_event->child_list, &parent_event->child_list); |
5325 | mutex_unlock(&parent_event->child_mutex); |
5326 | |
5327 | return child_event; |
5328 | } |
5329 | |
5330 | static int inherit_group(struct perf_event *parent_event, |
5331 | struct task_struct *parent, |
5332 | struct perf_event_context *parent_ctx, |
5333 | struct task_struct *child, |
5334 | struct perf_event_context *child_ctx) |
5335 | { |
5336 | struct perf_event *leader; |
5337 | struct perf_event *sub; |
5338 | struct perf_event *child_ctr; |
5339 | |
5340 | leader = inherit_event(parent_event, parent, parent_ctx, |
5341 | child, NULL, child_ctx); |
5342 | if (IS_ERR(leader)) |
5343 | return PTR_ERR(leader); |
5344 | list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { |
5345 | child_ctr = inherit_event(sub, parent, parent_ctx, |
5346 | child, leader, child_ctx); |
5347 | if (IS_ERR(child_ctr)) |
5348 | return PTR_ERR(child_ctr); |
5349 | } |
5350 | return 0; |
5351 | } |
5352 | |
5353 | static void sync_child_event(struct perf_event *child_event, |
5354 | struct task_struct *child) |
5355 | { |
5356 | struct perf_event *parent_event = child_event->parent; |
5357 | u64 child_val; |
5358 | |
5359 | if (child_event->attr.inherit_stat) |
5360 | perf_event_read_event(child_event, child); |
5361 | |
5362 | child_val = atomic64_read(&child_event->count); |
5363 | |
5364 | /* |
5365 | * Add back the child's count to the parent's count: |
5366 | */ |
5367 | atomic64_add(child_val, &parent_event->count); |
5368 | atomic64_add(child_event->total_time_enabled, |
5369 | &parent_event->child_total_time_enabled); |
5370 | atomic64_add(child_event->total_time_running, |
5371 | &parent_event->child_total_time_running); |
5372 | |
5373 | /* |
5374 | * Remove this event from the parent's list |
5375 | */ |
5376 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); |
5377 | mutex_lock(&parent_event->child_mutex); |
5378 | list_del_init(&child_event->child_list); |
5379 | mutex_unlock(&parent_event->child_mutex); |
5380 | |
5381 | /* |
5382 | * Release the parent event, if this was the last |
5383 | * reference to it. |
5384 | */ |
5385 | fput(parent_event->filp); |
5386 | } |
5387 | |
5388 | static void |
5389 | __perf_event_exit_task(struct perf_event *child_event, |
5390 | struct perf_event_context *child_ctx, |
5391 | struct task_struct *child) |
5392 | { |
5393 | struct perf_event *parent_event; |
5394 | |
5395 | perf_event_remove_from_context(child_event); |
5396 | |
5397 | parent_event = child_event->parent; |
5398 | /* |
5399 | * It can happen that parent exits first, and has events |
5400 | * that are still around due to the child reference. These |
5401 | * events need to be zapped - but otherwise linger. |
5402 | */ |
5403 | if (parent_event) { |
5404 | sync_child_event(child_event, child); |
5405 | free_event(child_event); |
5406 | } |
5407 | } |
5408 | |
5409 | /* |
5410 | * When a child task exits, feed back event values to parent events. |
5411 | */ |
5412 | void perf_event_exit_task(struct task_struct *child) |
5413 | { |
5414 | struct perf_event *child_event, *tmp; |
5415 | struct perf_event_context *child_ctx; |
5416 | unsigned long flags; |
5417 | |
5418 | if (likely(!child->perf_event_ctxp)) { |
5419 | perf_event_task(child, NULL, 0); |
5420 | return; |
5421 | } |
5422 | |
5423 | local_irq_save(flags); |
5424 | /* |
5425 | * We can't reschedule here because interrupts are disabled, |
5426 | * and either child is current or it is a task that can't be |
5427 | * scheduled, so we are now safe from rescheduling changing |
5428 | * our context. |
5429 | */ |
5430 | child_ctx = child->perf_event_ctxp; |
5431 | __perf_event_task_sched_out(child_ctx); |
5432 | |
5433 | /* |
5434 | * Take the context lock here so that if find_get_context is |
5435 | * reading child->perf_event_ctxp, we wait until it has |
5436 | * incremented the context's refcount before we do put_ctx below. |
5437 | */ |
5438 | raw_spin_lock(&child_ctx->lock); |
5439 | child->perf_event_ctxp = NULL; |
5440 | /* |
5441 | * If this context is a clone; unclone it so it can't get |
5442 | * swapped to another process while we're removing all |
5443 | * the events from it. |
5444 | */ |
5445 | unclone_ctx(child_ctx); |
5446 | update_context_time(child_ctx); |
5447 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
5448 | |
5449 | /* |
5450 | * Report the task dead after unscheduling the events so that we |
5451 | * won't get any samples after PERF_RECORD_EXIT. We can however still |
5452 | * get a few PERF_RECORD_READ events. |
5453 | */ |
5454 | perf_event_task(child, child_ctx, 0); |
5455 | |
5456 | /* |
5457 | * We can recurse on the same lock type through: |
5458 | * |
5459 | * __perf_event_exit_task() |
5460 | * sync_child_event() |
5461 | * fput(parent_event->filp) |
5462 | * perf_release() |
5463 | * mutex_lock(&ctx->mutex) |
5464 | * |
5465 | * But since its the parent context it won't be the same instance. |
5466 | */ |
5467 | mutex_lock(&child_ctx->mutex); |
5468 | |
5469 | again: |
5470 | list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, |
5471 | group_entry) |
5472 | __perf_event_exit_task(child_event, child_ctx, child); |
5473 | |
5474 | list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, |
5475 | group_entry) |
5476 | __perf_event_exit_task(child_event, child_ctx, child); |
5477 | |
5478 | /* |
5479 | * If the last event was a group event, it will have appended all |
5480 | * its siblings to the list, but we obtained 'tmp' before that which |
5481 | * will still point to the list head terminating the iteration. |
5482 | */ |
5483 | if (!list_empty(&child_ctx->pinned_groups) || |
5484 | !list_empty(&child_ctx->flexible_groups)) |
5485 | goto again; |
5486 | |
5487 | mutex_unlock(&child_ctx->mutex); |
5488 | |
5489 | put_ctx(child_ctx); |
5490 | } |
5491 | |
5492 | static void perf_free_event(struct perf_event *event, |
5493 | struct perf_event_context *ctx) |
5494 | { |
5495 | struct perf_event *parent = event->parent; |
5496 | |
5497 | if (WARN_ON_ONCE(!parent)) |
5498 | return; |
5499 | |
5500 | mutex_lock(&parent->child_mutex); |
5501 | list_del_init(&event->child_list); |
5502 | mutex_unlock(&parent->child_mutex); |
5503 | |
5504 | fput(parent->filp); |
5505 | |
5506 | perf_group_detach(event); |
5507 | list_del_event(event, ctx); |
5508 | free_event(event); |
5509 | } |
5510 | |
5511 | /* |
5512 | * free an unexposed, unused context as created by inheritance by |
5513 | * init_task below, used by fork() in case of fail. |
5514 | */ |
5515 | void perf_event_free_task(struct task_struct *task) |
5516 | { |
5517 | struct perf_event_context *ctx = task->perf_event_ctxp; |
5518 | struct perf_event *event, *tmp; |
5519 | |
5520 | if (!ctx) |
5521 | return; |
5522 | |
5523 | mutex_lock(&ctx->mutex); |
5524 | again: |
5525 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5526 | perf_free_event(event, ctx); |
5527 | |
5528 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, |
5529 | group_entry) |
5530 | perf_free_event(event, ctx); |
5531 | |
5532 | if (!list_empty(&ctx->pinned_groups) || |
5533 | !list_empty(&ctx->flexible_groups)) |
5534 | goto again; |
5535 | |
5536 | mutex_unlock(&ctx->mutex); |
5537 | |
5538 | put_ctx(ctx); |
5539 | } |
5540 | |
5541 | static int |
5542 | inherit_task_group(struct perf_event *event, struct task_struct *parent, |
5543 | struct perf_event_context *parent_ctx, |
5544 | struct task_struct *child, |
5545 | int *inherited_all) |
5546 | { |
5547 | int ret; |
5548 | struct perf_event_context *child_ctx = child->perf_event_ctxp; |
5549 | |
5550 | if (!event->attr.inherit) { |
5551 | *inherited_all = 0; |
5552 | return 0; |
5553 | } |
5554 | |
5555 | if (!child_ctx) { |
5556 | /* |
5557 | * This is executed from the parent task context, so |
5558 | * inherit events that have been marked for cloning. |
5559 | * First allocate and initialize a context for the |
5560 | * child. |
5561 | */ |
5562 | |
5563 | child_ctx = kzalloc(sizeof(struct perf_event_context), |
5564 | GFP_KERNEL); |
5565 | if (!child_ctx) |
5566 | return -ENOMEM; |
5567 | |
5568 | __perf_event_init_context(child_ctx, child); |
5569 | child->perf_event_ctxp = child_ctx; |
5570 | get_task_struct(child); |
5571 | } |
5572 | |
5573 | ret = inherit_group(event, parent, parent_ctx, |
5574 | child, child_ctx); |
5575 | |
5576 | if (ret) |
5577 | *inherited_all = 0; |
5578 | |
5579 | return ret; |
5580 | } |
5581 | |
5582 | |
5583 | /* |
5584 | * Initialize the perf_event context in task_struct |
5585 | */ |
5586 | int perf_event_init_task(struct task_struct *child) |
5587 | { |
5588 | struct perf_event_context *child_ctx, *parent_ctx; |
5589 | struct perf_event_context *cloned_ctx; |
5590 | struct perf_event *event; |
5591 | struct task_struct *parent = current; |
5592 | int inherited_all = 1; |
5593 | int ret = 0; |
5594 | |
5595 | child->perf_event_ctxp = NULL; |
5596 | |
5597 | mutex_init(&child->perf_event_mutex); |
5598 | INIT_LIST_HEAD(&child->perf_event_list); |
5599 | |
5600 | if (likely(!parent->perf_event_ctxp)) |
5601 | return 0; |
5602 | |
5603 | /* |
5604 | * If the parent's context is a clone, pin it so it won't get |
5605 | * swapped under us. |
5606 | */ |
5607 | parent_ctx = perf_pin_task_context(parent); |
5608 | |
5609 | /* |
5610 | * No need to check if parent_ctx != NULL here; since we saw |
5611 | * it non-NULL earlier, the only reason for it to become NULL |
5612 | * is if we exit, and since we're currently in the middle of |
5613 | * a fork we can't be exiting at the same time. |
5614 | */ |
5615 | |
5616 | /* |
5617 | * Lock the parent list. No need to lock the child - not PID |
5618 | * hashed yet and not running, so nobody can access it. |
5619 | */ |
5620 | mutex_lock(&parent_ctx->mutex); |
5621 | |
5622 | /* |
5623 |  * We don't have to disable NMIs - we are only looking at |
5624 |  * the list, not manipulating it: |
5625 | */ |
5626 | list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { |
5627 | ret = inherit_task_group(event, parent, parent_ctx, child, |
5628 | &inherited_all); |
5629 | if (ret) |
5630 | break; |
5631 | } |
5632 | |
5633 | list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { |
5634 | ret = inherit_task_group(event, parent, parent_ctx, child, |
5635 | &inherited_all); |
5636 | if (ret) |
5637 | break; |
5638 | } |
5639 | |
5640 | child_ctx = child->perf_event_ctxp; |
5641 | |
5642 | if (child_ctx && inherited_all) { |
5643 | /* |
5644 | * Mark the child context as a clone of the parent |
5645 | * context, or of whatever the parent is a clone of. |
5646 | * Note that if the parent is a clone, it could get |
5647 | * uncloned at any point, but that doesn't matter |
5648 | * because the list of events and the generation |
5649 | * count can't have changed since we took the mutex. |
5650 | */ |
5651 | cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); |
5652 | if (cloned_ctx) { |
5653 | child_ctx->parent_ctx = cloned_ctx; |
5654 | child_ctx->parent_gen = parent_ctx->parent_gen; |
5655 | } else { |
5656 | child_ctx->parent_ctx = parent_ctx; |
5657 | child_ctx->parent_gen = parent_ctx->generation; |
5658 | } |
5659 | get_ctx(child_ctx->parent_ctx); |
5660 | } |
5661 | |
5662 | mutex_unlock(&parent_ctx->mutex); |
5663 | |
5664 | perf_unpin_context(parent_ctx); |
5665 | |
5666 | return ret; |
5667 | } |
5668 | |
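/*
 * Boot-time setup: initialize the per-CPU context and the swevent
 * hash-list mutex for every possible CPU.
 */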
5669 | static void __init perf_event_init_all_cpus(void) |
5670 | { |
5671 | int cpu; |
5672 | struct perf_cpu_context *cpuctx; |
5673 | |
5674 | for_each_possible_cpu(cpu) { |
5675 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
5676 | mutex_init(&cpuctx->hlist_mutex); |
5677 | __perf_event_init_context(&cpuctx->ctx, NULL); |
5678 | } |
5679 | } |
5680 | |
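/*
 * CPU coming up: recompute how many events a task may use on this CPU
 * and, if software events still hold a reference, re-allocate the
 * swevent hash list that was released when the CPU went down.
 */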
5681 | static void __cpuinit perf_event_init_cpu(int cpu) |
5682 | { |
5683 | struct perf_cpu_context *cpuctx; |
5684 | |
5685 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
5686 | |
5687 | spin_lock(&perf_resource_lock); |
5688 | cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; |
5689 | spin_unlock(&perf_resource_lock); |
5690 | |
5691 | mutex_lock(&cpuctx->hlist_mutex); |
5692 | if (cpuctx->hlist_refcount > 0) { |
5693 | struct swevent_hlist *hlist; |
5694 | |
5695 | hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); |
5696 | WARN_ON_ONCE(!hlist); |
5697 | rcu_assign_pointer(cpuctx->swevent_hlist, hlist); |
5698 | } |
5699 | mutex_unlock(&cpuctx->hlist_mutex); |
5700 | } |
5701 | |
5702 | #ifdef CONFIG_HOTPLUG_CPU |
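/*
 * Runs on the CPU that is going offline (via smp_call_function_single)
 * and removes every event from that CPU's context.
 */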
5703 | static void __perf_event_exit_cpu(void *info) |
5704 | { |
5705 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); |
5706 | struct perf_event_context *ctx = &cpuctx->ctx; |
5707 | struct perf_event *event, *tmp; |
5708 | |
5709 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
5710 | __perf_event_remove_from_context(event); |
5711 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
5712 | __perf_event_remove_from_context(event); |
5713 | } |
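
/*
 * CPU going down: release the swevent hash list, then remove all events
 * from the CPU context on the outgoing CPU itself.
 */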
5714 | static void perf_event_exit_cpu(int cpu) |
5715 | { |
5716 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); |
5717 | struct perf_event_context *ctx = &cpuctx->ctx; |
5718 | |
5719 | mutex_lock(&cpuctx->hlist_mutex); |
5720 | swevent_hlist_release(cpuctx); |
5721 | mutex_unlock(&cpuctx->hlist_mutex); |
5722 | |
5723 | mutex_lock(&ctx->mutex); |
5724 | smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); |
5725 | mutex_unlock(&ctx->mutex); |
5726 | } |
5727 | #else |
5728 | static inline void perf_event_exit_cpu(int cpu) { } |
5729 | #endif |
5730 | |
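/*
 * Hotplug notifier: set up per-CPU state before a CPU comes online and
 * tear it down before a CPU is taken offline.
 */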
5731 | static int __cpuinit |
5732 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) |
5733 | { |
5734 | unsigned int cpu = (long)hcpu; |
5735 | |
5736 | switch (action) { |
5737 | |
5738 | case CPU_UP_PREPARE: |
5739 | case CPU_UP_PREPARE_FROZEN: |
5740 | perf_event_init_cpu(cpu); |
5741 | break; |
5742 | |
5743 | case CPU_DOWN_PREPARE: |
5744 | case CPU_DOWN_PREPARE_FROZEN: |
5745 | perf_event_exit_cpu(cpu); |
5746 | break; |
5747 | |
5748 | default: |
5749 | break; |
5750 | } |
5751 | |
5752 | return NOTIFY_OK; |
5753 | } |
5754 | |
5755 | /* |
5756 | * This has to have a higher priority than migration_notifier in sched.c. |
5757 | */ |
5758 | static struct notifier_block __cpuinitdata perf_cpu_nb = { |
5759 | .notifier_call = perf_cpu_notify, |
5760 | .priority = 20, |
5761 | }; |
5762 | |
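/*
 * Core initialization: set up all per-CPU contexts, replay the
 * UP_PREPARE/ONLINE notifications for the boot CPU, and register the
 * hotplug notifier for CPUs that come and go later.
 */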
5763 | void __init perf_event_init(void) |
5764 | { |
5765 | perf_event_init_all_cpus(); |
5766 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, |
5767 | (void *)(long)smp_processor_id()); |
5768 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, |
5769 | (void *)(long)smp_processor_id()); |
5770 | register_cpu_notifier(&perf_cpu_nb); |
5771 | } |
5772 | |
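/*
 * sysfs 'reserve_percpu' show handler: report how many events are
 * currently reserved on each CPU (perf_reserved_percpu).
 */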
5773 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, |
5774 | struct sysdev_class_attribute *attr, |
5775 | char *buf) |
5776 | { |
5777 | return sprintf(buf, "%d\n", perf_reserved_percpu); |
5778 | } |
5779 | |
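/*
 * sysfs 'reserve_percpu' store handler: validate the new reservation
 * against perf_max_events and recompute max_pertask for every online
 * CPU under the resource lock.
 */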
5780 | static ssize_t |
5781 | perf_set_reserve_percpu(struct sysdev_class *class, |
5782 | struct sysdev_class_attribute *attr, |
5783 | const char *buf, |
5784 | size_t count) |
5785 | { |
5786 | struct perf_cpu_context *cpuctx; |
5787 | unsigned long val; |
5788 | int err, cpu, mpt; |
5789 | |
5790 | err = strict_strtoul(buf, 10, &val); |
5791 | if (err) |
5792 | return err; |
5793 | if (val > perf_max_events) |
5794 | return -EINVAL; |
5795 | |
5796 | spin_lock(&perf_resource_lock); |
5797 | perf_reserved_percpu = val; |
5798 | for_each_online_cpu(cpu) { |
5799 | cpuctx = &per_cpu(perf_cpu_context, cpu); |
5800 | raw_spin_lock_irq(&cpuctx->ctx.lock); |
5801 | mpt = min(perf_max_events - cpuctx->ctx.nr_events, |
5802 | perf_max_events - perf_reserved_percpu); |
5803 | cpuctx->max_pertask = mpt; |
5804 | raw_spin_unlock_irq(&cpuctx->ctx.lock); |
5805 | } |
5806 | spin_unlock(&perf_resource_lock); |
5807 | |
5808 | return count; |
5809 | } |
5810 | |
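/*
 * sysfs 'overcommit' attribute: exposes the perf_overcommit flag; the
 * store handler below only accepts the values 0 and 1.
 */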
5811 | static ssize_t perf_show_overcommit(struct sysdev_class *class, |
5812 | struct sysdev_class_attribute *attr, |
5813 | char *buf) |
5814 | { |
5815 | return sprintf(buf, "%d\n", perf_overcommit); |
5816 | } |
5817 | |
5818 | static ssize_t |
5819 | perf_set_overcommit(struct sysdev_class *class, |
5820 | struct sysdev_class_attribute *attr, |
5821 | const char *buf, size_t count) |
5822 | { |
5823 | unsigned long val; |
5824 | int err; |
5825 | |
5826 | err = strict_strtoul(buf, 10, &val); |
5827 | if (err) |
5828 | return err; |
5829 | if (val > 1) |
5830 | return -EINVAL; |
5831 | |
5832 | spin_lock(&perf_resource_lock); |
5833 | perf_overcommit = val; |
5834 | spin_unlock(&perf_resource_lock); |
5835 | |
5836 | return count; |
5837 | } |
5838 | |
5839 | static SYSDEV_CLASS_ATTR( |
5840 | reserve_percpu, |
5841 | 0644, |
5842 | perf_show_reserve_percpu, |
5843 | perf_set_reserve_percpu |
5844 | ); |
5845 | |
5846 | static SYSDEV_CLASS_ATTR( |
5847 | overcommit, |
5848 | 0644, |
5849 | perf_show_overcommit, |
5850 | perf_set_overcommit |
5851 | ); |
5852 | |
5853 | static struct attribute *perfclass_attrs[] = { |
5854 | &attr_reserve_percpu.attr, |
5855 | &attr_overcommit.attr, |
5856 | NULL |
5857 | }; |
5858 | |
5859 | static struct attribute_group perfclass_attr_group = { |
5860 | .attrs = perfclass_attrs, |
5861 | .name = "perf_events", |
5862 | }; |
5863 | |
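/*
 * Register the "perf_events" attribute group (reserve_percpu and
 * overcommit) under the CPU sysdev class.
 */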
5864 | static int __init perf_event_sysfs_init(void) |
5865 | { |
5866 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, |
5867 | &perfclass_attr_group); |
5868 | } |
5869 | device_initcall(perf_event_sysfs_init); |
5870 |