Root/
1 | /* |
2 | * builtin-stat.c |
3 | * |
4 | * Builtin stat command: Give a precise performance counters summary |
5 | * overview about any workload, CPU or specific PID. |
6 | * |
7 | * Sample output: |
8 | |
9 | $ perf stat ~/hackbench 10 |
10 | Time: 0.104 |
11 | |
12 | Performance counter stats for '/home/mingo/hackbench': |
13 | |
14 | 1255.538611 task clock ticks # 10.143 CPU utilization factor |
15 | 54011 context switches # 0.043 M/sec |
16 | 385 CPU migrations # 0.000 M/sec |
17 | 17755 pagefaults # 0.014 M/sec |
18 | 3808323185 CPU cycles # 3033.219 M/sec |
19 | 1575111190 instructions # 1254.530 M/sec |
20 | 17367895 cache references # 13.833 M/sec |
21 | 7674421 cache misses # 6.112 M/sec |
22 | |
23 | Wall-clock time elapsed: 123.786620 msecs |
24 | |
25 | * |
26 | * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> |
27 | * |
28 | * Improvements and fixes by: |
29 | * |
30 | * Arjan van de Ven <arjan@linux.intel.com> |
31 | * Yanmin Zhang <yanmin.zhang@intel.com> |
32 | * Wu Fengguang <fengguang.wu@intel.com> |
33 | * Mike Galbraith <efault@gmx.de> |
34 | * Paul Mackerras <paulus@samba.org> |
35 | * Jaswinder Singh Rajput <jaswinder@kernel.org> |
36 | * |
37 | * Released under the GPL v2. (and only v2, not any later version) |
38 | */ |
39 | |
40 | #include "perf.h" |
41 | #include "builtin.h" |
42 | #include "util/util.h" |
43 | #include "util/parse-options.h" |
44 | #include "util/parse-events.h" |
45 | #include "util/event.h" |
46 | #include "util/debug.h" |
47 | #include "util/header.h" |
48 | #include "util/cpumap.h" |
49 | |
50 | #include <sys/prctl.h> |
51 | #include <math.h> |
52 | |
/*
 * Events counted when none were selected on the command line:
 * cmd_stat() copies this table into attrs[] when nr_counters is still
 * zero and a null run (-n) was not requested.
 */
static struct perf_event_attr default_attrs[] = {

  /* software events */
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },

  /* hardware events */
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },

};
68 | |
/* -a/--all-cpus: open one counter per CPU instead of per task */
static int system_wide = 0;
/* number of fd[] rows in use: read_cpu_map() result, or 1 per-task */
static unsigned int nr_cpus = 0;
/* index of the run currently executed by the loop in cmd_stat() */
static int run_idx = 0;

/* -r/--repeat: how many times the workload is run */
static int run_count = 1;
/* -i/--inherit: copied into attr->inherit for per-task counters */
static int inherit = 1;
/* -c/--scale: also read enabled/running times and scale the counts */
static int scale = 1;
/* -p/--pid: existing task to measure, -1 when none was given */
static pid_t target_pid = -1;
/* pid of the command forked by run_perf_stat(), -1 when none */
static pid_t child_pid = -1;
/* -n/--null: do not fall back to default_attrs when no event is given */
static int null_run = 0;

/* counter file descriptors, indexed [cpu][counter] */
static int fd[MAX_NR_CPUS][MAX_COUNTERS];

/*
 * Per-counter flag set by read_counter(): -1 when the counter never ran
 * (printed as <not counted>), 1 when its count was scaled up.
 */
static int event_scaled[MAX_COUNTERS];

/* set from skip_signal() to end the wait loop when measuring a -p task */
static volatile int done = 0;
85 | |
/*
 * Running statistics accumulator maintained by update_stats():
 * n is the number of samples folded in so far, mean their running
 * average and M2 the running sum of squared deviations from the mean
 * (divided by n - 1 in stddev_stats() to obtain the variance).
 */
struct stats
{
	double n, mean, M2;
};
90 | |
91 | static void update_stats(struct stats *stats, u64 val) |
92 | { |
93 | double delta; |
94 | |
95 | stats->n++; |
96 | delta = val - stats->mean; |
97 | stats->mean += delta / stats->n; |
98 | stats->M2 += delta*(val - stats->mean); |
99 | } |
100 | |
101 | static double avg_stats(struct stats *stats) |
102 | { |
103 | return stats->mean; |
104 | } |
105 | |
106 | /* |
107 | * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance |
108 | * |
109 | * (\Sum n_i^2) - ((\Sum n_i)^2)/n |
110 | * s^2 = ------------------------------- |
111 | * n - 1 |
112 | * |
113 | * http://en.wikipedia.org/wiki/Stddev |
114 | * |
115 | * The std dev of the mean is related to the std dev by: |
116 | * |
117 | * s |
118 | * s_mean = ------- |
119 | * sqrt(n) |
120 | * |
121 | */ |
122 | static double stddev_stats(struct stats *stats) |
123 | { |
124 | double variance = stats->M2 / (stats->n - 1); |
125 | double variance_mean = variance / stats->n; |
126 | |
127 | return sqrt(variance_mean); |
128 | } |
129 | |
/*
 * Per-counter accumulators fed by read_counter() once per run:
 * [0] the (possibly scaled) count, [1] and [2] the second and third
 * value read from the counter, which print_counter() reports as the
 * enabled and running averages.
 */
struct stats event_res_stats[MAX_COUNTERS][3];
/* task-clock counts, used as the base of the "M/sec" column */
struct stats runtime_nsecs_stats;
/* rdclock() delta around each run, printed as the elapsed time */
struct stats walltime_nsecs_stats;
/* cpu-cycles counts, used as the base of the IPC column */
struct stats runtime_cycles_stats;
/* branch-instructions counts, used as the base of the branch-miss ratio */
struct stats runtime_branches_stats;

/*
 * True when attrs[counter] is the event of type PERF_TYPE_<t> with
 * config PERF_COUNT_<c>.
 */
#define MATCH_EVENT(t, c, counter)			\
	(attrs[counter].type == PERF_TYPE_##t &&	\
	 attrs[counter].config == PERF_COUNT_##c)

/* fprintf() format used when opening a counter fails: index, return value, strerror */
#define ERR_PERF_OPEN \
"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
142 | |
143 | static void create_perf_stat_counter(int counter, int pid) |
144 | { |
145 | struct perf_event_attr *attr = attrs + counter; |
146 | |
147 | if (scale) |
148 | attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | |
149 | PERF_FORMAT_TOTAL_TIME_RUNNING; |
150 | |
151 | if (system_wide) { |
152 | unsigned int cpu; |
153 | |
154 | for (cpu = 0; cpu < nr_cpus; cpu++) { |
155 | fd[cpu][counter] = sys_perf_event_open(attr, -1, cpumap[cpu], -1, 0); |
156 | if (fd[cpu][counter] < 0 && verbose) |
157 | fprintf(stderr, ERR_PERF_OPEN, counter, |
158 | fd[cpu][counter], strerror(errno)); |
159 | } |
160 | } else { |
161 | attr->inherit = inherit; |
162 | attr->disabled = 1; |
163 | attr->enable_on_exec = 1; |
164 | |
165 | fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0); |
166 | if (fd[0][counter] < 0 && verbose) |
167 | fprintf(stderr, ERR_PERF_OPEN, counter, |
168 | fd[0][counter], strerror(errno)); |
169 | } |
170 | } |
171 | |
172 | /* |
173 | * Does the counter have nsecs as a unit? |
174 | */ |
175 | static inline int nsec_counter(int counter) |
176 | { |
177 | if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) || |
178 | MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) |
179 | return 1; |
180 | |
181 | return 0; |
182 | } |
183 | |
184 | /* |
185 | * Read out the results of a single counter: |
186 | */ |
187 | static void read_counter(int counter) |
188 | { |
189 | u64 count[3], single_count[3]; |
190 | unsigned int cpu; |
191 | size_t res, nv; |
192 | int scaled; |
193 | int i; |
194 | |
195 | count[0] = count[1] = count[2] = 0; |
196 | |
197 | nv = scale ? 3 : 1; |
198 | for (cpu = 0; cpu < nr_cpus; cpu++) { |
199 | if (fd[cpu][counter] < 0) |
200 | continue; |
201 | |
202 | res = read(fd[cpu][counter], single_count, nv * sizeof(u64)); |
203 | assert(res == nv * sizeof(u64)); |
204 | |
205 | close(fd[cpu][counter]); |
206 | fd[cpu][counter] = -1; |
207 | |
208 | count[0] += single_count[0]; |
209 | if (scale) { |
210 | count[1] += single_count[1]; |
211 | count[2] += single_count[2]; |
212 | } |
213 | } |
214 | |
215 | scaled = 0; |
216 | if (scale) { |
217 | if (count[2] == 0) { |
218 | event_scaled[counter] = -1; |
219 | count[0] = 0; |
220 | return; |
221 | } |
222 | |
223 | if (count[2] < count[1]) { |
224 | event_scaled[counter] = 1; |
225 | count[0] = (unsigned long long) |
226 | ((double)count[0] * count[1] / count[2] + 0.5); |
227 | } |
228 | } |
229 | |
230 | for (i = 0; i < 3; i++) |
231 | update_stats(&event_res_stats[counter][i], count[i]); |
232 | |
233 | if (verbose) { |
234 | fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter), |
235 | count[0], count[1], count[2]); |
236 | } |
237 | |
238 | /* |
239 | * Save the full runtime - to allow normalization during printout: |
240 | */ |
241 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) |
242 | update_stats(&runtime_nsecs_stats, count[0]); |
243 | if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter)) |
244 | update_stats(&runtime_cycles_stats, count[0]); |
245 | if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter)) |
246 | update_stats(&runtime_branches_stats, count[0]); |
247 | } |
248 | |
249 | static int run_perf_stat(int argc __used, const char **argv) |
250 | { |
251 | unsigned long long t0, t1; |
252 | int status = 0; |
253 | int counter; |
254 | int pid = target_pid; |
255 | int child_ready_pipe[2], go_pipe[2]; |
256 | const bool forks = (target_pid == -1 && argc > 0); |
257 | char buf; |
258 | |
259 | if (!system_wide) |
260 | nr_cpus = 1; |
261 | |
262 | if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { |
263 | perror("failed to create pipes"); |
264 | exit(1); |
265 | } |
266 | |
267 | if (forks) { |
268 | if ((pid = fork()) < 0) |
269 | perror("failed to fork"); |
270 | |
271 | if (!pid) { |
272 | close(child_ready_pipe[0]); |
273 | close(go_pipe[1]); |
274 | fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); |
275 | |
276 | /* |
277 | * Do a dummy execvp to get the PLT entry resolved, |
278 | * so we avoid the resolver overhead on the real |
279 | * execvp call. |
280 | */ |
281 | execvp("", (char **)argv); |
282 | |
283 | /* |
284 | * Tell the parent we're ready to go |
285 | */ |
286 | close(child_ready_pipe[1]); |
287 | |
288 | /* |
289 | * Wait until the parent tells us to go. |
290 | */ |
291 | if (read(go_pipe[0], &buf, 1) == -1) |
292 | perror("unable to read pipe"); |
293 | |
294 | execvp(argv[0], (char **)argv); |
295 | |
296 | perror(argv[0]); |
297 | exit(-1); |
298 | } |
299 | |
300 | child_pid = pid; |
301 | |
302 | /* |
303 | * Wait for the child to be ready to exec. |
304 | */ |
305 | close(child_ready_pipe[1]); |
306 | close(go_pipe[0]); |
307 | if (read(child_ready_pipe[0], &buf, 1) == -1) |
308 | perror("unable to read pipe"); |
309 | close(child_ready_pipe[0]); |
310 | } |
311 | |
312 | for (counter = 0; counter < nr_counters; counter++) |
313 | create_perf_stat_counter(counter, pid); |
314 | |
315 | /* |
316 | * Enable counters and exec the command: |
317 | */ |
318 | t0 = rdclock(); |
319 | |
320 | if (forks) { |
321 | close(go_pipe[1]); |
322 | wait(&status); |
323 | } else { |
324 | while(!done); |
325 | } |
326 | |
327 | t1 = rdclock(); |
328 | |
329 | update_stats(&walltime_nsecs_stats, t1 - t0); |
330 | |
331 | for (counter = 0; counter < nr_counters; counter++) |
332 | read_counter(counter); |
333 | |
334 | return WEXITSTATUS(status); |
335 | } |
336 | |
337 | static void print_noise(int counter, double avg) |
338 | { |
339 | if (run_count == 1) |
340 | return; |
341 | |
342 | fprintf(stderr, " ( +- %7.3f%% )", |
343 | 100 * stddev_stats(&event_res_stats[counter][0]) / avg); |
344 | } |
345 | |
346 | static void nsec_printout(int counter, double avg) |
347 | { |
348 | double msecs = avg / 1e6; |
349 | |
350 | fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter)); |
351 | |
352 | if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) { |
353 | fprintf(stderr, " # %10.3f CPUs ", |
354 | avg / avg_stats(&walltime_nsecs_stats)); |
355 | } |
356 | } |
357 | |
358 | static void abs_printout(int counter, double avg) |
359 | { |
360 | double total, ratio = 0.0; |
361 | |
362 | fprintf(stderr, " %14.0f %-24s", avg, event_name(counter)); |
363 | |
364 | if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) { |
365 | total = avg_stats(&runtime_cycles_stats); |
366 | |
367 | if (total) |
368 | ratio = avg / total; |
369 | |
370 | fprintf(stderr, " # %10.3f IPC ", ratio); |
371 | } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) && |
372 | runtime_branches_stats.n != 0) { |
373 | total = avg_stats(&runtime_branches_stats); |
374 | |
375 | if (total) |
376 | ratio = avg * 100 / total; |
377 | |
378 | fprintf(stderr, " # %10.3f %% ", ratio); |
379 | |
380 | } else if (runtime_nsecs_stats.n != 0) { |
381 | total = avg_stats(&runtime_nsecs_stats); |
382 | |
383 | if (total) |
384 | ratio = 1000.0 * avg / total; |
385 | |
386 | fprintf(stderr, " # %10.3f M/sec", ratio); |
387 | } |
388 | } |
389 | |
390 | /* |
391 | * Print out the results of a single counter: |
392 | */ |
393 | static void print_counter(int counter) |
394 | { |
395 | double avg = avg_stats(&event_res_stats[counter][0]); |
396 | int scaled = event_scaled[counter]; |
397 | |
398 | if (scaled == -1) { |
399 | fprintf(stderr, " %14s %-24s\n", |
400 | "<not counted>", event_name(counter)); |
401 | return; |
402 | } |
403 | |
404 | if (nsec_counter(counter)) |
405 | nsec_printout(counter, avg); |
406 | else |
407 | abs_printout(counter, avg); |
408 | |
409 | print_noise(counter, avg); |
410 | |
411 | if (scaled) { |
412 | double avg_enabled, avg_running; |
413 | |
414 | avg_enabled = avg_stats(&event_res_stats[counter][1]); |
415 | avg_running = avg_stats(&event_res_stats[counter][2]); |
416 | |
417 | fprintf(stderr, " (scaled from %.2f%%)", |
418 | 100 * avg_running / avg_enabled); |
419 | } |
420 | |
421 | fprintf(stderr, "\n"); |
422 | } |
423 | |
424 | static void print_stat(int argc, const char **argv) |
425 | { |
426 | int i, counter; |
427 | |
428 | fflush(stdout); |
429 | |
430 | fprintf(stderr, "\n"); |
431 | fprintf(stderr, " Performance counter stats for "); |
432 | if(target_pid == -1) { |
433 | fprintf(stderr, "\'%s", argv[0]); |
434 | for (i = 1; i < argc; i++) |
435 | fprintf(stderr, " %s", argv[i]); |
436 | }else |
437 | fprintf(stderr, "task pid \'%d", target_pid); |
438 | |
439 | fprintf(stderr, "\'"); |
440 | if (run_count > 1) |
441 | fprintf(stderr, " (%d runs)", run_count); |
442 | fprintf(stderr, ":\n\n"); |
443 | |
444 | for (counter = 0; counter < nr_counters; counter++) |
445 | print_counter(counter); |
446 | |
447 | fprintf(stderr, "\n"); |
448 | fprintf(stderr, " %14.9f seconds time elapsed", |
449 | avg_stats(&walltime_nsecs_stats)/1e9); |
450 | if (run_count > 1) { |
451 | fprintf(stderr, " ( +- %7.3f%% )", |
452 | 100*stddev_stats(&walltime_nsecs_stats) / |
453 | avg_stats(&walltime_nsecs_stats)); |
454 | } |
455 | fprintf(stderr, "\n\n"); |
456 | } |
457 | |
/* last signal seen by skip_signal(), -1 while none has arrived */
static volatile int signr = -1;

/*
 * Handler installed by cmd_stat() for SIGINT, SIGALRM and SIGABRT.
 * It records the signal number for sig_atexit() and, when an existing
 * task is being measured (-p), sets 'done' to end the wait loop in
 * run_perf_stat().
 */
static void skip_signal(int signo)
{
	if(target_pid != -1)
		done = 1;

	signr = signo;
}
467 | |
468 | static void sig_atexit(void) |
469 | { |
470 | if (child_pid != -1) |
471 | kill(child_pid, SIGTERM); |
472 | |
473 | if (signr == -1) |
474 | return; |
475 | |
476 | signal(signr, SIG_DFL); |
477 | kill(getpid(), signr); |
478 | } |
479 | |
/* usage text handed to parse_options() / usage_with_options() */
static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL
};

/*
 * Command line options of 'perf stat'.  Apart from -e, which is handled
 * by the parse_events callback, every option writes directly into one
 * of the file-scope variables.
 */
static const struct option options[] = {
	OPT_CALLBACK('e', "event", NULL, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
	OPT_INTEGER('p', "pid", &target_pid,
		    "stat events on existing pid"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('c', "scale", &scale,
		    "scale/normalize counters"),
	OPT_BOOLEAN('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	/* NOTE(review): the "max: 100" in the help text is not enforced in cmd_stat() */
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100)"),
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - dont start any counters"),
	OPT_END()
};
505 | |
506 | int cmd_stat(int argc, const char **argv, const char *prefix __used) |
507 | { |
508 | int status; |
509 | |
510 | argc = parse_options(argc, argv, options, stat_usage, |
511 | PARSE_OPT_STOP_AT_NON_OPTION); |
512 | if (!argc && target_pid == -1) |
513 | usage_with_options(stat_usage, options); |
514 | if (run_count <= 0) |
515 | usage_with_options(stat_usage, options); |
516 | |
517 | /* Set attrs and nr_counters if no event is selected and !null_run */ |
518 | if (!null_run && !nr_counters) { |
519 | memcpy(attrs, default_attrs, sizeof(default_attrs)); |
520 | nr_counters = ARRAY_SIZE(default_attrs); |
521 | } |
522 | |
523 | if (system_wide) |
524 | nr_cpus = read_cpu_map(); |
525 | else |
526 | nr_cpus = 1; |
527 | |
528 | /* |
529 | * We dont want to block the signals - that would cause |
530 | * child tasks to inherit that and Ctrl-C would not work. |
531 | * What we want is for Ctrl-C to work in the exec()-ed |
532 | * task, but being ignored by perf stat itself: |
533 | */ |
534 | atexit(sig_atexit); |
535 | signal(SIGINT, skip_signal); |
536 | signal(SIGALRM, skip_signal); |
537 | signal(SIGABRT, skip_signal); |
538 | |
539 | status = 0; |
540 | for (run_idx = 0; run_idx < run_count; run_idx++) { |
541 | if (run_count != 1 && verbose) |
542 | fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); |
543 | status = run_perf_stat(argc, argv); |
544 | } |
545 | |
546 | print_stat(argc, argv); |
547 | |
548 | return status; |
549 | } |
550 |
Branches:
ben-wpan
ben-wpan-stefan
javiroman/ks7010
jz-2.6.34
jz-2.6.34-rc5
jz-2.6.34-rc6
jz-2.6.34-rc7
jz-2.6.35
jz-2.6.36
jz-2.6.37
jz-2.6.38
jz-2.6.39
jz-3.0
jz-3.1
jz-3.11
jz-3.12
jz-3.13
jz-3.15
jz-3.16
jz-3.18-dt
jz-3.2
jz-3.3
jz-3.4
jz-3.5
jz-3.6
jz-3.6-rc2-pwm
jz-3.9
jz-3.9-clk
jz-3.9-rc8
jz47xx
jz47xx-2.6.38
master
Tags:
od-2011-09-04
od-2011-09-18
v2.6.34-rc5
v2.6.34-rc6
v2.6.34-rc7
v3.9