/* Root/tools/perf/builtin-stat.c */

/*
 * builtin-stat.c
 *
 * Builtin stat command: Give a precise performance counters summary
 * overview about any workload, CPU or specific PID.
 *
 * Sample output:

   $ perf stat ./hackbench 10

  Time: 0.118

  Performance counter stats for './hackbench 10':

       1708.761321 task-clock                #   11.037 CPUs utilized
            41,190 context-switches          #    0.024 M/sec
             6,735 CPU-migrations            #    0.004 M/sec
            17,318 page-faults               #    0.010 M/sec
     5,205,202,243 cycles                    #    3.046 GHz
     3,856,436,920 stalled-cycles-frontend   #   74.09% frontend cycles idle
     1,600,790,871 stalled-cycles-backend    #   30.75% backend  cycles idle
     2,603,501,247 instructions              #    0.50  insns per cycle
                                             #    1.48  stalled cycles per insn
       484,357,498 branches                  #  283.455 M/sec
         6,388,934 branch-misses             #    1.32% of all branches

        0.154822978  seconds time elapsed

 *
 * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *   Jaswinder Singh Rajput <jaswinder@kernel.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */
43
#include "perf.h"
#include "builtin.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/color.h"
#include "util/header.h"
#include "util/cpumap.h"
#include "util/thread.h"
#include "util/thread_map.h"

#include <sys/prctl.h>
#include <math.h>
#include <locale.h>
62
/* Column separator used when -x/--field-separator is not given. */
#define DEFAULT_SEPARATOR	" "

65static struct perf_event_attr default_attrs[] = {
66
67  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
68  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
69  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
70  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
71
72  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
73  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
74  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
75  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
76  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
77  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
78
79};
80
81/*
82 * Detailed stats (-d), covering the L1 and last level data caches:
83 */
84static struct perf_event_attr detailed_attrs[] = {
85
86  { .type = PERF_TYPE_HW_CACHE,
87    .config =
88     PERF_COUNT_HW_CACHE_L1D << 0 |
89    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
90    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
91
92  { .type = PERF_TYPE_HW_CACHE,
93    .config =
94     PERF_COUNT_HW_CACHE_L1D << 0 |
95    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
96    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
97
98  { .type = PERF_TYPE_HW_CACHE,
99    .config =
100     PERF_COUNT_HW_CACHE_LL << 0 |
101    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
102    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
103
104  { .type = PERF_TYPE_HW_CACHE,
105    .config =
106     PERF_COUNT_HW_CACHE_LL << 0 |
107    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
108    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
109};
110
111/*
112 * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
113 */
114static struct perf_event_attr very_detailed_attrs[] = {
115
116  { .type = PERF_TYPE_HW_CACHE,
117    .config =
118     PERF_COUNT_HW_CACHE_L1I << 0 |
119    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
120    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
121
122  { .type = PERF_TYPE_HW_CACHE,
123    .config =
124     PERF_COUNT_HW_CACHE_L1I << 0 |
125    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
126    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
127
128  { .type = PERF_TYPE_HW_CACHE,
129    .config =
130     PERF_COUNT_HW_CACHE_DTLB << 0 |
131    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
132    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
133
134  { .type = PERF_TYPE_HW_CACHE,
135    .config =
136     PERF_COUNT_HW_CACHE_DTLB << 0 |
137    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
138    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
139
140  { .type = PERF_TYPE_HW_CACHE,
141    .config =
142     PERF_COUNT_HW_CACHE_ITLB << 0 |
143    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
144    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
145
146  { .type = PERF_TYPE_HW_CACHE,
147    .config =
148     PERF_COUNT_HW_CACHE_ITLB << 0 |
149    (PERF_COUNT_HW_CACHE_OP_READ << 8) |
150    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
151
152};
153
154/*
155 * Very, very detailed stats (-d -d -d), adding prefetch events:
156 */
157static struct perf_event_attr very_very_detailed_attrs[] = {
158
159  { .type = PERF_TYPE_HW_CACHE,
160    .config =
161     PERF_COUNT_HW_CACHE_L1D << 0 |
162    (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
163    (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
164
165  { .type = PERF_TYPE_HW_CACHE,
166    .config =
167     PERF_COUNT_HW_CACHE_L1D << 0 |
168    (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
169    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
170};
171
172
173
174struct perf_evlist *evsel_list;
175
176static bool system_wide = false;
177static int run_idx = 0;
178
179static int run_count = 1;
180static bool no_inherit = false;
181static bool scale = true;
182static bool no_aggr = false;
183static pid_t target_pid = -1;
184static pid_t target_tid = -1;
185static pid_t child_pid = -1;
186static bool null_run = false;
187static int detailed_run = 0;
188static bool sync_run = false;
189static bool big_num = true;
190static int big_num_opt = -1;
191static const char *cpu_list;
192static const char *csv_sep = NULL;
193static bool csv_output = false;
194
195static volatile int done = 0;
196
/*
 * Running-statistics accumulator (Welford's online algorithm):
 * n samples, their mean, and M2 = sum of squared deviations.
 */
struct stats
{
	double n, mean, M2;
};

/* Per-evsel private data: stats for value, time-enabled, time-running. */
struct perf_stat {
	struct stats	  res_stats[3];
};

206static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
207{
208    evsel->priv = zalloc(sizeof(struct perf_stat));
209    return evsel->priv == NULL ? -ENOMEM : 0;
210}
211
212static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
213{
214    free(evsel->priv);
215    evsel->priv = NULL;
216}
217
218static void update_stats(struct stats *stats, u64 val)
219{
220    double delta;
221
222    stats->n++;
223    delta = val - stats->mean;
224    stats->mean += delta / stats->n;
225    stats->M2 += delta*(val - stats->mean);
226}
227
228static double avg_stats(struct stats *stats)
229{
230    return stats->mean;
231}
232
233/*
234 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
235 *
236 * (\Sum n_i^2) - ((\Sum n_i)^2)/n
237 * s^2 = -------------------------------
238 * n - 1
239 *
240 * http://en.wikipedia.org/wiki/Stddev
241 *
242 * The std dev of the mean is related to the std dev by:
243 *
244 * s
245 * s_mean = -------
246 * sqrt(n)
247 *
248 */
249static double stddev_stats(struct stats *stats)
250{
251    double variance = stats->M2 / (stats->n - 1);
252    double variance_mean = variance / stats->n;
253
254    return sqrt(variance_mean);
255}
256
257struct stats runtime_nsecs_stats[MAX_NR_CPUS];
258struct stats runtime_cycles_stats[MAX_NR_CPUS];
259struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
260struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
261struct stats runtime_branches_stats[MAX_NR_CPUS];
262struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
263struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
264struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
265struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
266struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
267struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
268struct stats walltime_nsecs_stats;
269
270static int create_perf_stat_counter(struct perf_evsel *evsel)
271{
272    struct perf_event_attr *attr = &evsel->attr;
273
274    if (scale)
275        attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
276                    PERF_FORMAT_TOTAL_TIME_RUNNING;
277
278    attr->inherit = !no_inherit;
279
280    if (system_wide)
281        return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);
282
283    if (target_pid == -1 && target_tid == -1) {
284        attr->disabled = 1;
285        attr->enable_on_exec = 1;
286    }
287
288    return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
289}
290
291/*
292 * Does the counter have nsecs as a unit?
293 */
294static inline int nsec_counter(struct perf_evsel *evsel)
295{
296    if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
297        perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
298        return 1;
299
300    return 0;
301}
302
303/*
304 * Update various tracking values we maintain to print
305 * more semantic information such as miss/hit ratios,
306 * instruction rates, etc:
307 */
308static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
309{
310    if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
311        update_stats(&runtime_nsecs_stats[0], count[0]);
312    else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
313        update_stats(&runtime_cycles_stats[0], count[0]);
314    else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
315        update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
316    else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
317        update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
318    else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
319        update_stats(&runtime_branches_stats[0], count[0]);
320    else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
321        update_stats(&runtime_cacherefs_stats[0], count[0]);
322    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
323        update_stats(&runtime_l1_dcache_stats[0], count[0]);
324    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
325        update_stats(&runtime_l1_icache_stats[0], count[0]);
326    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
327        update_stats(&runtime_ll_cache_stats[0], count[0]);
328    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
329        update_stats(&runtime_dtlb_cache_stats[0], count[0]);
330    else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
331        update_stats(&runtime_itlb_cache_stats[0], count[0]);
332}
333
334/*
335 * Read out the results of a single counter:
336 * aggregate counts across CPUs in system-wide mode
337 */
338static int read_counter_aggr(struct perf_evsel *counter)
339{
340    struct perf_stat *ps = counter->priv;
341    u64 *count = counter->counts->aggr.values;
342    int i;
343
344    if (__perf_evsel__read(counter, evsel_list->cpus->nr,
345                   evsel_list->threads->nr, scale) < 0)
346        return -1;
347
348    for (i = 0; i < 3; i++)
349        update_stats(&ps->res_stats[i], count[i]);
350
351    if (verbose) {
352        fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
353            event_name(counter), count[0], count[1], count[2]);
354    }
355
356    /*
357     * Save the full runtime - to allow normalization during printout:
358     */
359    update_shadow_stats(counter, count);
360
361    return 0;
362}
363
364/*
365 * Read out the results of a single counter:
366 * do not aggregate counts across CPUs in system-wide mode
367 */
368static int read_counter(struct perf_evsel *counter)
369{
370    u64 *count;
371    int cpu;
372
373    for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
374        if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
375            return -1;
376
377        count = counter->counts->cpu[cpu].values;
378
379        update_shadow_stats(counter, count);
380    }
381
382    return 0;
383}
384
385static int run_perf_stat(int argc __used, const char **argv)
386{
387    unsigned long long t0, t1;
388    struct perf_evsel *counter;
389    int status = 0;
390    int child_ready_pipe[2], go_pipe[2];
391    const bool forks = (argc > 0);
392    char buf;
393
394    if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
395        perror("failed to create pipes");
396        exit(1);
397    }
398
399    if (forks) {
400        if ((child_pid = fork()) < 0)
401            perror("failed to fork");
402
403        if (!child_pid) {
404            close(child_ready_pipe[0]);
405            close(go_pipe[1]);
406            fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
407
408            /*
409             * Do a dummy execvp to get the PLT entry resolved,
410             * so we avoid the resolver overhead on the real
411             * execvp call.
412             */
413            execvp("", (char **)argv);
414
415            /*
416             * Tell the parent we're ready to go
417             */
418            close(child_ready_pipe[1]);
419
420            /*
421             * Wait until the parent tells us to go.
422             */
423            if (read(go_pipe[0], &buf, 1) == -1)
424                perror("unable to read pipe");
425
426            execvp(argv[0], (char **)argv);
427
428            perror(argv[0]);
429            exit(-1);
430        }
431
432        if (target_tid == -1 && target_pid == -1 && !system_wide)
433            evsel_list->threads->map[0] = child_pid;
434
435        /*
436         * Wait for the child to be ready to exec.
437         */
438        close(child_ready_pipe[1]);
439        close(go_pipe[0]);
440        if (read(child_ready_pipe[0], &buf, 1) == -1)
441            perror("unable to read pipe");
442        close(child_ready_pipe[0]);
443    }
444
445    list_for_each_entry(counter, &evsel_list->entries, node) {
446        if (create_perf_stat_counter(counter) < 0) {
447            if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) {
448                if (verbose)
449                    ui__warning("%s event is not supported by the kernel.\n",
450                            event_name(counter));
451                continue;
452            }
453
454            if (errno == EPERM || errno == EACCES) {
455                error("You may not have permission to collect %sstats.\n"
456                      "\t Consider tweaking"
457                      " /proc/sys/kernel/perf_event_paranoid or running as root.",
458                      system_wide ? "system-wide " : "");
459            } else {
460                error("open_counter returned with %d (%s). "
461                      "/bin/dmesg may provide additional information.\n",
462                       errno, strerror(errno));
463            }
464            if (child_pid != -1)
465                kill(child_pid, SIGTERM);
466            die("Not all events could be opened.\n");
467            return -1;
468        }
469    }
470
471    if (perf_evlist__set_filters(evsel_list)) {
472        error("failed to set filter with %d (%s)\n", errno,
473            strerror(errno));
474        return -1;
475    }
476
477    /*
478     * Enable counters and exec the command:
479     */
480    t0 = rdclock();
481
482    if (forks) {
483        close(go_pipe[1]);
484        wait(&status);
485    } else {
486        while(!done) sleep(1);
487    }
488
489    t1 = rdclock();
490
491    update_stats(&walltime_nsecs_stats, t1 - t0);
492
493    if (no_aggr) {
494        list_for_each_entry(counter, &evsel_list->entries, node) {
495            read_counter(counter);
496            perf_evsel__close_fd(counter, evsel_list->cpus->nr, 1);
497        }
498    } else {
499        list_for_each_entry(counter, &evsel_list->entries, node) {
500            read_counter_aggr(counter);
501            perf_evsel__close_fd(counter, evsel_list->cpus->nr,
502                         evsel_list->threads->nr);
503        }
504    }
505
506    return WEXITSTATUS(status);
507}
508
/* Print "total" as a percentage of "avg" in the ( +-x.xx% ) noise format. */
static void print_noise_pct(double total, double avg)
{
	double pct = 0.0;

	if (avg)
		pct = 100.0*total/avg;

	fprintf(stderr, "  ( +-%6.2f%% )", pct);
}

519static void print_noise(struct perf_evsel *evsel, double avg)
520{
521    struct perf_stat *ps;
522
523    if (run_count == 1)
524        return;
525
526    ps = evsel->priv;
527    print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
528}
529
530static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg)
531{
532    double msecs = avg / 1e6;
533    char cpustr[16] = { '\0', };
534    const char *fmt = csv_output ? "%s%.6f%s%s" : "%s%18.6f%s%-25s";
535
536    if (no_aggr)
537        sprintf(cpustr, "CPU%*d%s",
538            csv_output ? 0 : -4,
539            evsel_list->cpus->map[cpu], csv_sep);
540
541    fprintf(stderr, fmt, cpustr, msecs, csv_sep, event_name(evsel));
542
543    if (evsel->cgrp)
544        fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
545
546    if (csv_output)
547        return;
548
549    if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
550        fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats));
551}
552
553static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg)
554{
555    double total, ratio = 0.0;
556    const char *color;
557
558    total = avg_stats(&runtime_cycles_stats[cpu]);
559
560    if (total)
561        ratio = avg / total * 100.0;
562
563    color = PERF_COLOR_NORMAL;
564    if (ratio > 50.0)
565        color = PERF_COLOR_RED;
566    else if (ratio > 30.0)
567        color = PERF_COLOR_MAGENTA;
568    else if (ratio > 10.0)
569        color = PERF_COLOR_YELLOW;
570
571    fprintf(stderr, " # ");
572    color_fprintf(stderr, color, "%6.2f%%", ratio);
573    fprintf(stderr, " frontend cycles idle ");
574}
575
576static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg)
577{
578    double total, ratio = 0.0;
579    const char *color;
580
581    total = avg_stats(&runtime_cycles_stats[cpu]);
582
583    if (total)
584        ratio = avg / total * 100.0;
585
586    color = PERF_COLOR_NORMAL;
587    if (ratio > 75.0)
588        color = PERF_COLOR_RED;
589    else if (ratio > 50.0)
590        color = PERF_COLOR_MAGENTA;
591    else if (ratio > 20.0)
592        color = PERF_COLOR_YELLOW;
593
594    fprintf(stderr, " # ");
595    color_fprintf(stderr, color, "%6.2f%%", ratio);
596    fprintf(stderr, " backend cycles idle ");
597}
598
599static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg)
600{
601    double total, ratio = 0.0;
602    const char *color;
603
604    total = avg_stats(&runtime_branches_stats[cpu]);
605
606    if (total)
607        ratio = avg / total * 100.0;
608
609    color = PERF_COLOR_NORMAL;
610    if (ratio > 20.0)
611        color = PERF_COLOR_RED;
612    else if (ratio > 10.0)
613        color = PERF_COLOR_MAGENTA;
614    else if (ratio > 5.0)
615        color = PERF_COLOR_YELLOW;
616
617    fprintf(stderr, " # ");
618    color_fprintf(stderr, color, "%6.2f%%", ratio);
619    fprintf(stderr, " of all branches ");
620}
621
622static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
623{
624    double total, ratio = 0.0;
625    const char *color;
626
627    total = avg_stats(&runtime_l1_dcache_stats[cpu]);
628
629    if (total)
630        ratio = avg / total * 100.0;
631
632    color = PERF_COLOR_NORMAL;
633    if (ratio > 20.0)
634        color = PERF_COLOR_RED;
635    else if (ratio > 10.0)
636        color = PERF_COLOR_MAGENTA;
637    else if (ratio > 5.0)
638        color = PERF_COLOR_YELLOW;
639
640    fprintf(stderr, " # ");
641    color_fprintf(stderr, color, "%6.2f%%", ratio);
642    fprintf(stderr, " of all L1-dcache hits ");
643}
644
645static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
646{
647    double total, ratio = 0.0;
648    const char *color;
649
650    total = avg_stats(&runtime_l1_icache_stats[cpu]);
651
652    if (total)
653        ratio = avg / total * 100.0;
654
655    color = PERF_COLOR_NORMAL;
656    if (ratio > 20.0)
657        color = PERF_COLOR_RED;
658    else if (ratio > 10.0)
659        color = PERF_COLOR_MAGENTA;
660    else if (ratio > 5.0)
661        color = PERF_COLOR_YELLOW;
662
663    fprintf(stderr, " # ");
664    color_fprintf(stderr, color, "%6.2f%%", ratio);
665    fprintf(stderr, " of all L1-icache hits ");
666}
667
668static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
669{
670    double total, ratio = 0.0;
671    const char *color;
672
673    total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
674
675    if (total)
676        ratio = avg / total * 100.0;
677
678    color = PERF_COLOR_NORMAL;
679    if (ratio > 20.0)
680        color = PERF_COLOR_RED;
681    else if (ratio > 10.0)
682        color = PERF_COLOR_MAGENTA;
683    else if (ratio > 5.0)
684        color = PERF_COLOR_YELLOW;
685
686    fprintf(stderr, " # ");
687    color_fprintf(stderr, color, "%6.2f%%", ratio);
688    fprintf(stderr, " of all dTLB cache hits ");
689}
690
691static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
692{
693    double total, ratio = 0.0;
694    const char *color;
695
696    total = avg_stats(&runtime_itlb_cache_stats[cpu]);
697
698    if (total)
699        ratio = avg / total * 100.0;
700
701    color = PERF_COLOR_NORMAL;
702    if (ratio > 20.0)
703        color = PERF_COLOR_RED;
704    else if (ratio > 10.0)
705        color = PERF_COLOR_MAGENTA;
706    else if (ratio > 5.0)
707        color = PERF_COLOR_YELLOW;
708
709    fprintf(stderr, " # ");
710    color_fprintf(stderr, color, "%6.2f%%", ratio);
711    fprintf(stderr, " of all iTLB cache hits ");
712}
713
714static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, double avg)
715{
716    double total, ratio = 0.0;
717    const char *color;
718
719    total = avg_stats(&runtime_ll_cache_stats[cpu]);
720
721    if (total)
722        ratio = avg / total * 100.0;
723
724    color = PERF_COLOR_NORMAL;
725    if (ratio > 20.0)
726        color = PERF_COLOR_RED;
727    else if (ratio > 10.0)
728        color = PERF_COLOR_MAGENTA;
729    else if (ratio > 5.0)
730        color = PERF_COLOR_YELLOW;
731
732    fprintf(stderr, " # ");
733    color_fprintf(stderr, color, "%6.2f%%", ratio);
734    fprintf(stderr, " of all LL-cache hits ");
735}
736
737static void abs_printout(int cpu, struct perf_evsel *evsel, double avg)
738{
739    double total, ratio = 0.0;
740    char cpustr[16] = { '\0', };
741    const char *fmt;
742
743    if (csv_output)
744        fmt = "%s%.0f%s%s";
745    else if (big_num)
746        fmt = "%s%'18.0f%s%-25s";
747    else
748        fmt = "%s%18.0f%s%-25s";
749
750    if (no_aggr)
751        sprintf(cpustr, "CPU%*d%s",
752            csv_output ? 0 : -4,
753            evsel_list->cpus->map[cpu], csv_sep);
754    else
755        cpu = 0;
756
757    fprintf(stderr, fmt, cpustr, avg, csv_sep, event_name(evsel));
758
759    if (evsel->cgrp)
760        fprintf(stderr, "%s%s", csv_sep, evsel->cgrp->name);
761
762    if (csv_output)
763        return;
764
765    if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
766        total = avg_stats(&runtime_cycles_stats[cpu]);
767
768        if (total)
769            ratio = avg / total;
770
771        fprintf(stderr, " # %5.2f insns per cycle ", ratio);
772
773        total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
774        total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
775
776        if (total && avg) {
777            ratio = total / avg;
778            fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio);
779        }
780
781    } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
782            runtime_branches_stats[cpu].n != 0) {
783        print_branch_misses(cpu, evsel, avg);
784    } else if (
785        evsel->attr.type == PERF_TYPE_HW_CACHE &&
786        evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
787                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
788                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
789            runtime_l1_dcache_stats[cpu].n != 0) {
790        print_l1_dcache_misses(cpu, evsel, avg);
791    } else if (
792        evsel->attr.type == PERF_TYPE_HW_CACHE &&
793        evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
794                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
795                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
796            runtime_l1_icache_stats[cpu].n != 0) {
797        print_l1_icache_misses(cpu, evsel, avg);
798    } else if (
799        evsel->attr.type == PERF_TYPE_HW_CACHE &&
800        evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
801                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
802                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
803            runtime_dtlb_cache_stats[cpu].n != 0) {
804        print_dtlb_cache_misses(cpu, evsel, avg);
805    } else if (
806        evsel->attr.type == PERF_TYPE_HW_CACHE &&
807        evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
808                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
809                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
810            runtime_itlb_cache_stats[cpu].n != 0) {
811        print_itlb_cache_misses(cpu, evsel, avg);
812    } else if (
813        evsel->attr.type == PERF_TYPE_HW_CACHE &&
814        evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
815                    ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
816                    ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
817            runtime_ll_cache_stats[cpu].n != 0) {
818        print_ll_cache_misses(cpu, evsel, avg);
819    } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
820            runtime_cacherefs_stats[cpu].n != 0) {
821        total = avg_stats(&runtime_cacherefs_stats[cpu]);
822
823        if (total)
824            ratio = avg * 100 / total;
825
826        fprintf(stderr, " # %8.3f %% of all cache refs ", ratio);
827
828    } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
829        print_stalled_cycles_frontend(cpu, evsel, avg);
830    } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
831        print_stalled_cycles_backend(cpu, evsel, avg);
832    } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
833        total = avg_stats(&runtime_nsecs_stats[cpu]);
834
835        if (total)
836            ratio = 1.0 * avg / total;
837
838        fprintf(stderr, " # %8.3f GHz ", ratio);
839    } else if (runtime_nsecs_stats[cpu].n != 0) {
840        total = avg_stats(&runtime_nsecs_stats[cpu]);
841
842        if (total)
843            ratio = 1000.0 * avg / total;
844
845        fprintf(stderr, " # %8.3f M/sec ", ratio);
846    } else {
847        fprintf(stderr, " ");
848    }
849}
850
851/*
852 * Print out the results of a single counter:
853 * aggregated counts in system-wide mode
854 */
855static void print_counter_aggr(struct perf_evsel *counter)
856{
857    struct perf_stat *ps = counter->priv;
858    double avg = avg_stats(&ps->res_stats[0]);
859    int scaled = counter->counts->scaled;
860
861    if (scaled == -1) {
862        fprintf(stderr, "%*s%s%*s",
863            csv_output ? 0 : 18,
864            "<not counted>",
865            csv_sep,
866            csv_output ? 0 : -24,
867            event_name(counter));
868
869        if (counter->cgrp)
870            fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
871
872        fputc('\n', stderr);
873        return;
874    }
875
876    if (nsec_counter(counter))
877        nsec_printout(-1, counter, avg);
878    else
879        abs_printout(-1, counter, avg);
880
881    if (csv_output) {
882        fputc('\n', stderr);
883        return;
884    }
885
886    print_noise(counter, avg);
887
888    if (scaled) {
889        double avg_enabled, avg_running;
890
891        avg_enabled = avg_stats(&ps->res_stats[1]);
892        avg_running = avg_stats(&ps->res_stats[2]);
893
894        fprintf(stderr, " [%5.2f%%]", 100 * avg_running / avg_enabled);
895    }
896    fprintf(stderr, "\n");
897}
898
899/*
900 * Print out the results of a single counter:
901 * does not use aggregated count in system-wide
902 */
903static void print_counter(struct perf_evsel *counter)
904{
905    u64 ena, run, val;
906    int cpu;
907
908    for (cpu = 0; cpu < evsel_list->cpus->nr; cpu++) {
909        val = counter->counts->cpu[cpu].val;
910        ena = counter->counts->cpu[cpu].ena;
911        run = counter->counts->cpu[cpu].run;
912        if (run == 0 || ena == 0) {
913            fprintf(stderr, "CPU%*d%s%*s%s%*s",
914                csv_output ? 0 : -4,
915                evsel_list->cpus->map[cpu], csv_sep,
916                csv_output ? 0 : 18,
917                "<not counted>", csv_sep,
918                csv_output ? 0 : -24,
919                event_name(counter));
920
921            if (counter->cgrp)
922                fprintf(stderr, "%s%s", csv_sep, counter->cgrp->name);
923
924            fputc('\n', stderr);
925            continue;
926        }
927
928        if (nsec_counter(counter))
929            nsec_printout(cpu, counter, val);
930        else
931            abs_printout(cpu, counter, val);
932
933        if (!csv_output) {
934            print_noise(counter, 1.0);
935
936            if (run != ena)
937                fprintf(stderr, " (%.2f%%)", 100.0 * run / ena);
938        }
939        fputc('\n', stderr);
940    }
941}
942
943static void print_stat(int argc, const char **argv)
944{
945    struct perf_evsel *counter;
946    int i;
947
948    fflush(stdout);
949
950    if (!csv_output) {
951        fprintf(stderr, "\n");
952        fprintf(stderr, " Performance counter stats for ");
953        if(target_pid == -1 && target_tid == -1) {
954            fprintf(stderr, "\'%s", argv[0]);
955            for (i = 1; i < argc; i++)
956                fprintf(stderr, " %s", argv[i]);
957        } else if (target_pid != -1)
958            fprintf(stderr, "process id \'%d", target_pid);
959        else
960            fprintf(stderr, "thread id \'%d", target_tid);
961
962        fprintf(stderr, "\'");
963        if (run_count > 1)
964            fprintf(stderr, " (%d runs)", run_count);
965        fprintf(stderr, ":\n\n");
966    }
967
968    if (no_aggr) {
969        list_for_each_entry(counter, &evsel_list->entries, node)
970            print_counter(counter);
971    } else {
972        list_for_each_entry(counter, &evsel_list->entries, node)
973            print_counter_aggr(counter);
974    }
975
976    if (!csv_output) {
977        if (!null_run)
978            fprintf(stderr, "\n");
979        fprintf(stderr, " %17.9f seconds time elapsed",
980                avg_stats(&walltime_nsecs_stats)/1e9);
981        if (run_count > 1) {
982            fprintf(stderr, " ");
983            print_noise_pct(stddev_stats(&walltime_nsecs_stats),
984                    avg_stats(&walltime_nsecs_stats));
985        }
986        fprintf(stderr, "\n\n");
987    }
988}
989
990static volatile int signr = -1;
991
992static void skip_signal(int signo)
993{
994    if(child_pid == -1)
995        done = 1;
996
997    signr = signo;
998}
999
1000static void sig_atexit(void)
1001{
1002    if (child_pid != -1)
1003        kill(child_pid, SIGTERM);
1004
1005    if (signr == -1)
1006        return;
1007
1008    signal(signr, SIG_DFL);
1009    kill(getpid(), signr);
1010}
1011
/* Usage banner printed by -h and by usage_with_options() on bad input. */
static const char * const stat_usage[] = {
	"perf stat [<options>] [<command>]",
	NULL
};
1016
1017static int stat__set_big_num(const struct option *opt __used,
1018                 const char *s __used, int unset)
1019{
1020    big_num_opt = unset ? 0 : 1;
1021    return 0;
1022}
1023
/* Command line option table for 'perf stat'; parsed in cmd_stat(). */
static const struct option options[] = {
	OPT_CALLBACK('e', "event", &evsel_list, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_CALLBACK(0, "filter", &evsel_list, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
		    "child tasks do not inherit counters"),
	OPT_INTEGER('p', "pid", &target_pid,
		    "stat events on existing process id"),
	OPT_INTEGER('t', "tid", &target_tid,
		    "stat events on existing thread id"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
		    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('c', "scale", &scale,
		    "scale/normalize counters"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_INTEGER('r', "repeat", &run_count,
		    "repeat command and print average + stddev (max: 100)"),
	OPT_BOOLEAN('n', "null", &null_run,
		    "null run - dont start any counters"),
	/* -d may be given up to three times; see add_default_attributes(). */
	OPT_INCR('d', "detailed", &detailed_run,
		    "detailed run - start a lot of events"),
	OPT_BOOLEAN('S', "sync", &sync_run,
		    "call sync() before starting a run"),
	/* NOOPT callback so both -B and --no-big-num are distinguishable. */
	OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
			   "print large numbers with thousands\' separators",
			   stat__set_big_num),
	OPT_STRING('C', "cpu", &cpu_list, "cpu",
		    "list of cpus to monitor in system-wide"),
	OPT_BOOLEAN('A', "no-aggr", &no_aggr,
		    "disable CPU count aggregation"),
	OPT_STRING('x', "field-separator", &csv_sep, "separator",
		   "print counts with custom separator"),
	OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_END()
};
1064
1065/*
1066 * Add default attributes, if there were no attributes specified or
1067 * if -d/--detailed, -d -d or -d -d -d is used:
1068 */
1069static int add_default_attributes(void)
1070{
1071    struct perf_evsel *pos;
1072    size_t attr_nr = 0;
1073    size_t c;
1074
1075    /* Set attrs if no event is selected and !null_run: */
1076    if (null_run)
1077        return 0;
1078
1079    if (!evsel_list->nr_entries) {
1080        for (c = 0; c < ARRAY_SIZE(default_attrs); c++) {
1081            pos = perf_evsel__new(default_attrs + c, c + attr_nr);
1082            if (pos == NULL)
1083                return -1;
1084            perf_evlist__add(evsel_list, pos);
1085        }
1086        attr_nr += c;
1087    }
1088
1089    /* Detailed events get appended to the event list: */
1090
1091    if (detailed_run < 1)
1092        return 0;
1093
1094    /* Append detailed run extra attributes: */
1095    for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) {
1096        pos = perf_evsel__new(detailed_attrs + c, c + attr_nr);
1097        if (pos == NULL)
1098            return -1;
1099        perf_evlist__add(evsel_list, pos);
1100    }
1101    attr_nr += c;
1102
1103    if (detailed_run < 2)
1104        return 0;
1105
1106    /* Append very detailed run extra attributes: */
1107    for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) {
1108        pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr);
1109        if (pos == NULL)
1110            return -1;
1111        perf_evlist__add(evsel_list, pos);
1112    }
1113
1114    if (detailed_run < 3)
1115        return 0;
1116
1117    /* Append very, very detailed run extra attributes: */
1118    for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) {
1119        pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr);
1120        if (pos == NULL)
1121            return -1;
1122        perf_evlist__add(evsel_list, pos);
1123    }
1124
1125
1126    return 0;
1127}
1128
/*
 * Entry point for 'perf stat': parse options, validate the option
 * combination, build the thread/CPU maps and per-evsel buffers, run
 * the workload run_count times, then print the summary.
 *
 * Returns 0 on success, a negative errno-style value on setup
 * failure, or the status of the last run_perf_stat() invocation.
 * Note: usage_with_options() does not return (it exits).
 */
int cmd_stat(int argc, const char **argv, const char *prefix __used)
{
	struct perf_evsel *pos;
	int status = -ENOMEM;

	/* Honor the user's locale, e.g. for %'d thousands grouping. */
	setlocale(LC_ALL, "");

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	/* Everything after the first non-option is the workload argv. */
	argc = parse_options(argc, argv, options, stat_usage,
		PARSE_OPT_STOP_AT_NON_OPTION);

	if (csv_sep)
		csv_output = true;
	else
		csv_sep = DEFAULT_SEPARATOR;

	/*
	 * let the spreadsheet do the pretty-printing
	 */
	if (csv_output) {
		/* User explicitely passed -B? */
		if (big_num_opt == 1) {
			fprintf(stderr, "-B option not supported with -x\n");
			usage_with_options(stat_usage, options);
		} else /* Nope, so disable big number formatting */
			big_num = false;
	} else if (big_num_opt == 0) /* User passed --no-big-num */
		big_num = false;

	/* Need either a workload to fork or an existing pid/tid to attach. */
	if (!argc && target_pid == -1 && target_tid == -1)
		usage_with_options(stat_usage, options);
	if (run_count <= 0)
		usage_with_options(stat_usage, options);

	/* no_aggr, cgroup are for system-wide only */
	if ((no_aggr || nr_cgroups) && !system_wide) {
		fprintf(stderr, "both cgroup and no-aggregation "
			"modes only available in system-wide mode\n");

		usage_with_options(stat_usage, options);
	}

	if (add_default_attributes())
		goto out;

	/* -p without -t: monitor the whole process via its main thread. */
	if (target_pid != -1)
		target_tid = target_pid;

	evsel_list->threads = thread_map__new(target_pid, target_tid);
	if (evsel_list->threads == NULL) {
		pr_err("Problems finding threads of monitor\n");
		usage_with_options(stat_usage, options);
	}

	if (system_wide)
		evsel_list->cpus = cpu_map__new(cpu_list);
	else
		evsel_list->cpus = cpu_map__dummy_new();

	if (evsel_list->cpus == NULL) {
		perror("failed to parse CPUs map");
		usage_with_options(stat_usage, options);
		return -1; /* not reached: usage_with_options() exits */
	}

	/* Per-event count/fd buffers sized by the cpu and thread maps. */
	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_evsel__alloc_stat_priv(pos) < 0 ||
		    perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 ||
		    perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0)
			goto out_free_fd;
	}

	/*
	 * We dont want to block the signals - that would cause
	 * child tasks to inherit that and Ctrl-C would not work.
	 * What we want is for Ctrl-C to work in the exec()-ed
	 * task, but being ignored by perf stat itself:
	 */
	atexit(sig_atexit);
	signal(SIGINT, skip_signal);
	signal(SIGALRM, skip_signal);
	signal(SIGABRT, skip_signal);

	status = 0;
	for (run_idx = 0; run_idx < run_count; run_idx++) {
		if (run_count != 1 && verbose)
			fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);

		/* -S: flush dirty pages so each run starts from a clean slate. */
		if (sync_run)
			sync();

		status = run_perf_stat(argc, argv);
	}

	if (status != -1)
		print_stat(argc, argv);
out_free_fd:
	list_for_each_entry(pos, &evsel_list->entries, node)
		perf_evsel__free_stat_priv(pos);
	perf_evlist__delete_maps(evsel_list);
out:
	perf_evlist__delete(evsel_list);
	return status;
}
1236
