tools/perf/builtin-sched.c

1#include "builtin.h"
2#include "perf.h"
3
4#include "util/util.h"
5#include "util/evlist.h"
6#include "util/cache.h"
7#include "util/evsel.h"
8#include "util/symbol.h"
9#include "util/thread.h"
10#include "util/header.h"
11#include "util/session.h"
12#include "util/tool.h"
13
14#include "util/parse-options.h"
15#include "util/trace-event.h"
16
17#include "util/debug.h"
18
19#include <sys/prctl.h>
20#include <sys/resource.h>
21
22#include <semaphore.h>
23#include <pthread.h>
24#include <math.h>
25
26static const char *input_name;
27
28static char default_sort_order[] = "avg, max, switch, runtime";
29static const char *sort_order = default_sort_order;
30
31static int profile_cpu = -1;
32
33#define PR_SET_NAME 15 /* Set process name */
34#define MAX_CPUS 4096
35
36static u64 run_measurement_overhead;
37static u64 sleep_measurement_overhead;
38
39#define COMM_LEN 20
40#define SYM_LEN 129
41
42#define MAX_PID 65536
43
44static unsigned long nr_tasks;
45
46struct sched_atom;
47
48struct task_desc {
49    unsigned long nr;
50    unsigned long pid;
51    char comm[COMM_LEN];
52
53    unsigned long nr_events;
54    unsigned long curr_event;
55    struct sched_atom **atoms;
56
57    pthread_t thread;
58    sem_t sleep_sem;
59
60    sem_t ready_for_work;
61    sem_t work_done_sem;
62
63    u64 cpu_usage;
64};
65
66enum sched_event_type {
67    SCHED_EVENT_RUN,
68    SCHED_EVENT_SLEEP,
69    SCHED_EVENT_WAKEUP,
70    SCHED_EVENT_MIGRATION,
71};
72
73struct sched_atom {
74    enum sched_event_type type;
75    int specific_wait;
76    u64 timestamp;
77    u64 duration;
78    unsigned long nr;
79    sem_t *wait_sem;
80    struct task_desc *wakee;
81};
82
83static struct task_desc *pid_to_task[MAX_PID];
84
85static struct task_desc **tasks;
86
87static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
88static u64 start_time;
89
90static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
91
92static unsigned long nr_run_events;
93static unsigned long nr_sleep_events;
94static unsigned long nr_wakeup_events;
95
96static unsigned long nr_sleep_corrections;
97static unsigned long nr_run_events_optimized;
98
99static unsigned long targetless_wakeups;
100static unsigned long multitarget_wakeups;
101
102static u64 cpu_usage;
103static u64 runavg_cpu_usage;
104static u64 parent_cpu_usage;
105static u64 runavg_parent_cpu_usage;
106
107static unsigned long nr_runs;
108static u64 sum_runtime;
109static u64 sum_fluct;
110static u64 run_avg;
111
112static unsigned int replay_repeat = 10;
113static unsigned long nr_timestamps;
114static unsigned long nr_unordered_timestamps;
115static unsigned long nr_state_machine_bugs;
116static unsigned long nr_context_switch_bugs;
117static unsigned long nr_events;
118static unsigned long nr_lost_chunks;
119static unsigned long nr_lost_events;
120
121#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
122
123enum thread_state {
124    THREAD_SLEEPING = 0,
125    THREAD_WAIT_CPU,
126    THREAD_SCHED_IN,
127    THREAD_IGNORE
128};
129
130struct work_atom {
131    struct list_head list;
132    enum thread_state state;
133    u64 sched_out_time;
134    u64 wake_up_time;
135    u64 sched_in_time;
136    u64 runtime;
137};
138
139struct work_atoms {
140    struct list_head work_list;
141    struct thread *thread;
142    struct rb_node node;
143    u64 max_lat;
144    u64 max_lat_at;
145    u64 total_lat;
146    u64 nb_atoms;
147    u64 total_runtime;
148};
149
150typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
151
152static struct rb_root atom_root, sorted_atom_root;
153
154static u64 all_runtime;
155static u64 all_count;
156
157
158static u64 get_nsecs(void)
159{
160    struct timespec ts;
161
162    clock_gettime(CLOCK_MONOTONIC, &ts);
163
164    return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
165}
166
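/*
 * Busy-loop for roughly 'nsecs' nanoseconds.  The previously calibrated
 * run_measurement_overhead is subtracted so the cost of get_nsecs()
 * itself does not inflate the simulated runtime.
 */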
167static void burn_nsecs(u64 nsecs)
168{
169    u64 T0 = get_nsecs(), T1;
170
171    do {
172        T1 = get_nsecs();
173    } while (T1 + run_measurement_overhead < T0 + nsecs);
174}
175
176static void sleep_nsecs(u64 nsecs)
177{
178    struct timespec ts;
179
180    ts.tv_nsec = nsecs % 999999999;
181    ts.tv_sec = nsecs / 999999999;
182
183    nanosleep(&ts, NULL);
184}
185
186static void calibrate_run_measurement_overhead(void)
187{
188    u64 T0, T1, delta, min_delta = 1000000000ULL;
189    int i;
190
191    for (i = 0; i < 10; i++) {
192        T0 = get_nsecs();
193        burn_nsecs(0);
194        T1 = get_nsecs();
195        delta = T1-T0;
196        min_delta = min(min_delta, delta);
197    }
198    run_measurement_overhead = min_delta;
199
200    printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
201}
202
203static void calibrate_sleep_measurement_overhead(void)
204{
205    u64 T0, T1, delta, min_delta = 1000000000ULL;
206    int i;
207
208    for (i = 0; i < 10; i++) {
209        T0 = get_nsecs();
210        sleep_nsecs(10000);
211        T1 = get_nsecs();
212        delta = T1-T0;
213        min_delta = min(min_delta, delta);
214    }
215    min_delta -= 10000;
216    sleep_measurement_overhead = min_delta;
217
218    printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
219}
220
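/*
 * Allocate a new atom, append it to the task's event array (grown via
 * realloc) and return it; callers fill in the type-specific fields.
 */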
221static struct sched_atom *
222get_new_event(struct task_desc *task, u64 timestamp)
223{
224    struct sched_atom *event = zalloc(sizeof(*event));
225    unsigned long idx = task->nr_events;
226    size_t size;
227
228    event->timestamp = timestamp;
229    event->nr = idx;
230
231    task->nr_events++;
232    size = sizeof(struct sched_atom *) * task->nr_events;
233    task->atoms = realloc(task->atoms, size);
234    BUG_ON(!task->atoms);
235
236    task->atoms[idx] = event;
237
238    return event;
239}
240
241static struct sched_atom *last_event(struct task_desc *task)
242{
243    if (!task->nr_events)
244        return NULL;
245
246    return task->atoms[task->nr_events - 1];
247}
248
249static void
250add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
251{
252    struct sched_atom *event, *curr_event = last_event(task);
253
254    /*
255     * optimize an existing RUN event by merging this one
256     * to it:
257     */
258    if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
259        nr_run_events_optimized++;
260        curr_event->duration += duration;
261        return;
262    }
263
264    event = get_new_event(task, timestamp);
265
266    event->type = SCHED_EVENT_RUN;
267    event->duration = duration;
268
269    nr_run_events++;
270}
271
272static void
273add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
274               struct task_desc *wakee)
275{
276    struct sched_atom *event, *wakee_event;
277
278    event = get_new_event(task, timestamp);
279    event->type = SCHED_EVENT_WAKEUP;
280    event->wakee = wakee;
281
282    wakee_event = last_event(wakee);
283    if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
284        targetless_wakeups++;
285        return;
286    }
287    if (wakee_event->wait_sem) {
288        multitarget_wakeups++;
289        return;
290    }
291
292    wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
293    sem_init(wakee_event->wait_sem, 0, 0);
294    wakee_event->specific_wait = 1;
295    event->wait_sem = wakee_event->wait_sem;
296
297    nr_wakeup_events++;
298}
299
300static void
301add_sched_event_sleep(struct task_desc *task, u64 timestamp,
302              u64 task_state __used)
303{
304    struct sched_atom *event = get_new_event(task, timestamp);
305
306    event->type = SCHED_EVENT_SLEEP;
307
308    nr_sleep_events++;
309}
310
311static struct task_desc *register_pid(unsigned long pid, const char *comm)
312{
313    struct task_desc *task;
314
315    BUG_ON(pid >= MAX_PID);
316
317    task = pid_to_task[pid];
318
319    if (task)
320        return task;
321
322    task = zalloc(sizeof(*task));
323    task->pid = pid;
324    task->nr = nr_tasks;
325    strcpy(task->comm, comm);
326    /*
327     * every task starts in sleeping state - this gets ignored
328     * if there's no wakeup pointing to this sleep state:
329     */
330    add_sched_event_sleep(task, 0, 0);
331
332    pid_to_task[pid] = task;
333    nr_tasks++;
334    tasks = realloc(tasks, nr_tasks*sizeof(struct task_desc *));
335    BUG_ON(!tasks);
336    tasks[task->nr] = task;
337
338    if (verbose)
339        printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
340
341    return task;
342}
343
344
345static void print_task_traces(void)
346{
347    struct task_desc *task;
348    unsigned long i;
349
350    for (i = 0; i < nr_tasks; i++) {
351        task = tasks[i];
352        printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
353            task->nr, task->comm, task->pid, task->nr_events);
354    }
355}
356
357static void add_cross_task_wakeups(void)
358{
359    struct task_desc *task1, *task2;
360    unsigned long i, j;
361
362    for (i = 0; i < nr_tasks; i++) {
363        task1 = tasks[i];
364        j = i + 1;
365        if (j == nr_tasks)
366            j = 0;
367        task2 = tasks[j];
368        add_sched_event_wakeup(task1, 0, task2);
369    }
370}
371
372static void
373process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
374{
375    int ret = 0;
376
377    switch (atom->type) {
378        case SCHED_EVENT_RUN:
379            burn_nsecs(atom->duration);
380            break;
381        case SCHED_EVENT_SLEEP:
382            if (atom->wait_sem)
383                ret = sem_wait(atom->wait_sem);
384            BUG_ON(ret);
385            break;
386        case SCHED_EVENT_WAKEUP:
387            if (atom->wait_sem)
388                ret = sem_post(atom->wait_sem);
389            BUG_ON(ret);
390            break;
391        case SCHED_EVENT_MIGRATION:
392            break;
393        default:
394            BUG_ON(1);
395    }
396}
397
398static u64 get_cpu_usage_nsec_parent(void)
399{
400    struct rusage ru;
401    u64 sum;
402    int err;
403
404    err = getrusage(RUSAGE_SELF, &ru);
405    BUG_ON(err);
406
407    sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
408    sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
409
410    return sum;
411}
412
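/*
 * Open a software task-clock counter on the calling thread; reading the
 * returned fd yields the CPU time this thread has consumed, in
 * nanoseconds (see get_cpu_usage_nsec_self()).
 */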
413static int self_open_counters(void)
414{
415    struct perf_event_attr attr;
416    int fd;
417
418    memset(&attr, 0, sizeof(attr));
419
420    attr.type = PERF_TYPE_SOFTWARE;
421    attr.config = PERF_COUNT_SW_TASK_CLOCK;
422
423    fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
424
425    if (fd < 0)
426        die("Error: sys_perf_event_open() syscall returned "
427            "with %d (%s)\n", fd, strerror(errno));
428    return fd;
429}
430
431static u64 get_cpu_usage_nsec_self(int fd)
432{
433    u64 runtime;
434    int ret;
435
436    ret = read(fd, &runtime, sizeof(runtime));
437    BUG_ON(ret != sizeof(runtime));
438
439    return runtime;
440}
441
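/*
 * Body of each replay worker: announce readiness, wait for the parent to
 * release start_work_mutex (used as a start barrier), replay this task's
 * atoms while measuring its own CPU time through the task-clock counter,
 * signal completion via work_done_sem, then park on work_done_wait_mutex
 * until the next iteration.
 */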
442static void *thread_func(void *ctx)
443{
444    struct task_desc *this_task = ctx;
445    u64 cpu_usage_0, cpu_usage_1;
446    unsigned long i, ret;
447    char comm2[22];
448    int fd;
449
450    sprintf(comm2, ":%s", this_task->comm);
451    prctl(PR_SET_NAME, comm2);
452    fd = self_open_counters();
453
454again:
455    ret = sem_post(&this_task->ready_for_work);
456    BUG_ON(ret);
457    ret = pthread_mutex_lock(&start_work_mutex);
458    BUG_ON(ret);
459    ret = pthread_mutex_unlock(&start_work_mutex);
460    BUG_ON(ret);
461
462    cpu_usage_0 = get_cpu_usage_nsec_self(fd);
463
464    for (i = 0; i < this_task->nr_events; i++) {
465        this_task->curr_event = i;
466        process_sched_event(this_task, this_task->atoms[i]);
467    }
468
469    cpu_usage_1 = get_cpu_usage_nsec_self(fd);
470    this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
471    ret = sem_post(&this_task->work_done_sem);
472    BUG_ON(ret);
473
474    ret = pthread_mutex_lock(&work_done_wait_mutex);
475    BUG_ON(ret);
476    ret = pthread_mutex_unlock(&work_done_wait_mutex);
477    BUG_ON(ret);
478
479    goto again;
480}
481
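/*
 * Create one replay thread per recorded task.  Both mutexes are taken
 * here, before the threads start, so every worker parks itself in
 * thread_func() until wait_for_tasks() lets it proceed.
 */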
482static void create_tasks(void)
483{
484    struct task_desc *task;
485    pthread_attr_t attr;
486    unsigned long i;
487    int err;
488
489    err = pthread_attr_init(&attr);
490    BUG_ON(err);
491    err = pthread_attr_setstacksize(&attr,
492            (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
493    BUG_ON(err);
494    err = pthread_mutex_lock(&start_work_mutex);
495    BUG_ON(err);
496    err = pthread_mutex_lock(&work_done_wait_mutex);
497    BUG_ON(err);
498    for (i = 0; i < nr_tasks; i++) {
499        task = tasks[i];
500        sem_init(&task->sleep_sem, 0, 0);
501        sem_init(&task->ready_for_work, 0, 0);
502        sem_init(&task->work_done_sem, 0, 0);
503        task->curr_event = 0;
504        err = pthread_create(&task->thread, &attr, thread_func, task);
505        BUG_ON(err);
506    }
507}
508
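/*
 * One replay iteration as seen from the parent: release the workers
 * parked on work_done_wait_mutex, wait until each signals
 * ready_for_work, re-take that mutex, then drop start_work_mutex so the
 * workers execute their atoms.  Finally sum the per-task CPU usage,
 * update the running averages and re-arm the start barrier for the next
 * round.
 */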
509static void wait_for_tasks(void)
510{
511    u64 cpu_usage_0, cpu_usage_1;
512    struct task_desc *task;
513    unsigned long i, ret;
514
515    start_time = get_nsecs();
516    cpu_usage = 0;
517    pthread_mutex_unlock(&work_done_wait_mutex);
518
519    for (i = 0; i < nr_tasks; i++) {
520        task = tasks[i];
521        ret = sem_wait(&task->ready_for_work);
522        BUG_ON(ret);
523        sem_init(&task->ready_for_work, 0, 0);
524    }
525    ret = pthread_mutex_lock(&work_done_wait_mutex);
526    BUG_ON(ret);
527
528    cpu_usage_0 = get_cpu_usage_nsec_parent();
529
530    pthread_mutex_unlock(&start_work_mutex);
531
532    for (i = 0; i < nr_tasks; i++) {
533        task = tasks[i];
534        ret = sem_wait(&task->work_done_sem);
535        BUG_ON(ret);
536        sem_init(&task->work_done_sem, 0, 0);
537        cpu_usage += task->cpu_usage;
538        task->cpu_usage = 0;
539    }
540
541    cpu_usage_1 = get_cpu_usage_nsec_parent();
542    if (!runavg_cpu_usage)
543        runavg_cpu_usage = cpu_usage;
544    runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
545
546    parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
547    if (!runavg_parent_cpu_usage)
548        runavg_parent_cpu_usage = parent_cpu_usage;
549    runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
550                   parent_cpu_usage)/10;
551
552    ret = pthread_mutex_lock(&start_work_mutex);
553    BUG_ON(ret);
554
555    for (i = 0; i < nr_tasks; i++) {
556        task = tasks[i];
557        sem_init(&task->sleep_sem, 0, 0);
558        task->curr_event = 0;
559    }
560}
561
562static void run_one_test(void)
563{
564    u64 T0, T1, delta, avg_delta, fluct;
565
566    T0 = get_nsecs();
567    wait_for_tasks();
568    T1 = get_nsecs();
569
570    delta = T1 - T0;
571    sum_runtime += delta;
572    nr_runs++;
573
574    avg_delta = sum_runtime / nr_runs;
575    if (delta < avg_delta)
576        fluct = avg_delta - delta;
577    else
578        fluct = delta - avg_delta;
579    sum_fluct += fluct;
580    if (!run_avg)
581        run_avg = delta;
582    run_avg = (run_avg*9 + delta)/10;
583
584    printf("#%-3ld: %0.3f, ",
585        nr_runs, (double)delta/1000000.0);
586
587    printf("ravg: %0.2f, ",
588        (double)run_avg/1e6);
589
590    printf("cpu: %0.2f / %0.2f",
591        (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
592
593#if 0
594    /*
595     * rusage statistics done by the parent, these are less
596     * accurate than the sum_exec_runtime based statistics:
597     */
598    printf(" [%0.2f / %0.2f]",
599        (double)parent_cpu_usage/1e6,
600        (double)runavg_parent_cpu_usage/1e6);
601#endif
602
603    printf("\n");
604
605    if (nr_sleep_corrections)
606        printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
607    nr_sleep_corrections = 0;
608}
609
610static void test_calibrations(void)
611{
612    u64 T0, T1;
613
614    T0 = get_nsecs();
615    burn_nsecs(1e6);
616    T1 = get_nsecs();
617
618    printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
619
620    T0 = get_nsecs();
621    sleep_nsecs(1e6);
622    T1 = get_nsecs();
623
624    printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
625}
626
627#define FILL_FIELD(ptr, field, event, data) \
628    ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)
629
630#define FILL_ARRAY(ptr, array, event, data) \
631do { \
632    void *__array = raw_field_ptr(event, #array, data); \
633    memcpy(ptr.array, __array, sizeof(ptr.array)); \
634} while(0)
635
636#define FILL_COMMON_FIELDS(ptr, event, data) \
637do { \
638    FILL_FIELD(ptr, common_type, event, data); \
639    FILL_FIELD(ptr, common_flags, event, data); \
640    FILL_FIELD(ptr, common_preempt_count, event, data); \
641    FILL_FIELD(ptr, common_pid, event, data); \
642    FILL_FIELD(ptr, common_tgid, event, data); \
643} while (0)
644
645
646
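/*
 * Local copies of the raw tracepoint payloads; the FILL_FIELD() /
 * FILL_ARRAY() macros above populate them from a sample's raw data by
 * field name.
 */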
647struct trace_switch_event {
648    u32 size;
649
650    u16 common_type;
651    u8 common_flags;
652    u8 common_preempt_count;
653    u32 common_pid;
654    u32 common_tgid;
655
656    char prev_comm[16];
657    u32 prev_pid;
658    u32 prev_prio;
659    u64 prev_state;
660    char next_comm[16];
661    u32 next_pid;
662    u32 next_prio;
663};
664
665struct trace_runtime_event {
666    u32 size;
667
668    u16 common_type;
669    u8 common_flags;
670    u8 common_preempt_count;
671    u32 common_pid;
672    u32 common_tgid;
673
674    char comm[16];
675    u32 pid;
676    u64 runtime;
677    u64 vruntime;
678};
679
680struct trace_wakeup_event {
681    u32 size;
682
683    u16 common_type;
684    u8 common_flags;
685    u8 common_preempt_count;
686    u32 common_pid;
687    u32 common_tgid;
688
689    char comm[16];
690    u32 pid;
691
692    u32 prio;
693    u32 success;
694    u32 cpu;
695};
696
697struct trace_fork_event {
698    u32 size;
699
700    u16 common_type;
701    u8 common_flags;
702    u8 common_preempt_count;
703    u32 common_pid;
704    u32 common_tgid;
705
706    char parent_comm[16];
707    u32 parent_pid;
708    char child_comm[16];
709    u32 child_pid;
710};
711
712struct trace_migrate_task_event {
713    u32 size;
714
715    u16 common_type;
716    u8 common_flags;
717    u8 common_preempt_count;
718    u32 common_pid;
719    u32 common_tgid;
720
721    char comm[16];
722    u32 pid;
723
724    u32 prio;
725    u32 cpu;
726};
727
728struct trace_sched_handler {
729    void (*switch_event)(struct trace_switch_event *,
730                 struct machine *,
731                 struct event *,
732                 int cpu,
733                 u64 timestamp,
734                 struct thread *thread);
735
736    void (*runtime_event)(struct trace_runtime_event *,
737                  struct machine *,
738                  struct event *,
739                  int cpu,
740                  u64 timestamp,
741                  struct thread *thread);
742
743    void (*wakeup_event)(struct trace_wakeup_event *,
744                 struct machine *,
745                 struct event *,
746                 int cpu,
747                 u64 timestamp,
748                 struct thread *thread);
749
750    void (*fork_event)(struct trace_fork_event *,
751               struct event *,
752               int cpu,
753               u64 timestamp,
754               struct thread *thread);
755
756    void (*migrate_task_event)(struct trace_migrate_task_event *,
757               struct machine *machine,
758               struct event *,
759               int cpu,
760               u64 timestamp,
761               struct thread *thread);
762};
763
764
765static void
766replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
767            struct machine *machine __used,
768            struct event *event,
769            int cpu __used,
770            u64 timestamp __used,
771            struct thread *thread __used)
772{
773    struct task_desc *waker, *wakee;
774
775    if (verbose) {
776        printf("sched_wakeup event %p\n", event);
777
778        printf(" ... pid %d woke up %s/%d\n",
779            wakeup_event->common_pid,
780            wakeup_event->comm,
781            wakeup_event->pid);
782    }
783
784    waker = register_pid(wakeup_event->common_pid, "<unknown>");
785    wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
786
787    add_sched_event_wakeup(waker, timestamp, wakee);
788}
789
790static u64 cpu_last_switched[MAX_CPUS];
791
792static void
793replay_switch_event(struct trace_switch_event *switch_event,
794            struct machine *machine __used,
795            struct event *event,
796            int cpu,
797            u64 timestamp,
798            struct thread *thread __used)
799{
800    struct task_desc *prev, __used *next;
801    u64 timestamp0;
802    s64 delta;
803
804    if (verbose)
805        printf("sched_switch event %p\n", event);
806
807    if (cpu >= MAX_CPUS || cpu < 0)
808        return;
809
810    timestamp0 = cpu_last_switched[cpu];
811    if (timestamp0)
812        delta = timestamp - timestamp0;
813    else
814        delta = 0;
815
816    if (delta < 0)
817        die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
818
819    if (verbose) {
820        printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
821            switch_event->prev_comm, switch_event->prev_pid,
822            switch_event->next_comm, switch_event->next_pid,
823            delta);
824    }
825
826    prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
827    next = register_pid(switch_event->next_pid, switch_event->next_comm);
828
829    cpu_last_switched[cpu] = timestamp;
830
831    add_sched_event_run(prev, timestamp, delta);
832    add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
833}
834
835
836static void
837replay_fork_event(struct trace_fork_event *fork_event,
838          struct event *event,
839          int cpu __used,
840          u64 timestamp __used,
841          struct thread *thread __used)
842{
843    if (verbose) {
844        printf("sched_fork event %p\n", event);
845        printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
846        printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
847    }
848    register_pid(fork_event->parent_pid, fork_event->parent_comm);
849    register_pid(fork_event->child_pid, fork_event->child_comm);
850}
851
852static struct trace_sched_handler replay_ops = {
853    .wakeup_event = replay_wakeup_event,
854    .switch_event = replay_switch_event,
855    .fork_event = replay_fork_event,
856};
857
858struct sort_dimension {
859    const char *name;
860    sort_fn_t cmp;
861    struct list_head list;
862};
863
864static LIST_HEAD(cmp_pid);
865
866static int
867thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
868{
869    struct sort_dimension *sort;
870    int ret = 0;
871
872    BUG_ON(list_empty(list));
873
874    list_for_each_entry(sort, list, list) {
875        ret = sort->cmp(l, r);
876        if (ret)
877            return ret;
878    }
879
880    return ret;
881}
882
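/*
 * Find the work_atoms entry for 'thread' in an rb-tree ordered by the
 * given sort keys; returns NULL if the thread has not been seen yet.
 */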
883static struct work_atoms *
884thread_atoms_search(struct rb_root *root, struct thread *thread,
885             struct list_head *sort_list)
886{
887    struct rb_node *node = root->rb_node;
888    struct work_atoms key = { .thread = thread };
889
890    while (node) {
891        struct work_atoms *atoms;
892        int cmp;
893
894        atoms = container_of(node, struct work_atoms, node);
895
896        cmp = thread_lat_cmp(sort_list, &key, atoms);
897        if (cmp > 0)
898            node = node->rb_left;
899        else if (cmp < 0)
900            node = node->rb_right;
901        else {
902            BUG_ON(thread != atoms->thread);
903            return atoms;
904        }
905    }
906    return NULL;
907}
908
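/*
 * Plain rb-tree insertion ordered by the comparators in 'sort_list';
 * used both for atom_root (keyed by pid via cmp_pid) and for
 * sorted_atom_root (keyed by the user-selected --sort dimensions).
 */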
909static void
910__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
911             struct list_head *sort_list)
912{
913    struct rb_node **new = &(root->rb_node), *parent = NULL;
914
915    while (*new) {
916        struct work_atoms *this;
917        int cmp;
918
919        this = container_of(*new, struct work_atoms, node);
920        parent = *new;
921
922        cmp = thread_lat_cmp(sort_list, data, this);
923
924        if (cmp > 0)
925            new = &((*new)->rb_left);
926        else
927            new = &((*new)->rb_right);
928    }
929
930    rb_link_node(&data->node, parent, new);
931    rb_insert_color(&data->node, root);
932}
933
934static void thread_atoms_insert(struct thread *thread)
935{
936    struct work_atoms *atoms = zalloc(sizeof(*atoms));
937    if (!atoms)
938        die("No memory");
939
940    atoms->thread = thread;
941    INIT_LIST_HEAD(&atoms->work_list);
942    __thread_latency_insert(&atom_root, atoms, &cmp_pid);
943}
944
945static void
946latency_fork_event(struct trace_fork_event *fork_event __used,
947           struct event *event __used,
948           int cpu __used,
949           u64 timestamp __used,
950           struct thread *thread __used)
951{
952    /* should insert the newcomer */
953}
954
955__used
956static char sched_out_state(struct trace_switch_event *switch_event)
957{
958    const char *str = TASK_STATE_TO_CHAR_STR;
959
960    return str[switch_event->prev_state];
961}
962
963static void
964add_sched_out_event(struct work_atoms *atoms,
965            char run_state,
966            u64 timestamp)
967{
968    struct work_atom *atom = zalloc(sizeof(*atom));
969    if (!atom)
970        die("No memory");
971
972    atom->sched_out_time = timestamp;
973
974    if (run_state == 'R') {
975        atom->state = THREAD_WAIT_CPU;
976        atom->wake_up_time = atom->sched_out_time;
977    }
978
979    list_add_tail(&atom->list, &atoms->work_list);
980}
981
982static void
983add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
984{
985    struct work_atom *atom;
986
987    BUG_ON(list_empty(&atoms->work_list));
988
989    atom = list_entry(atoms->work_list.prev, struct work_atom, list);
990
991    atom->runtime += delta;
992    atoms->total_runtime += delta;
993}
994
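/*
 * The task was scheduled in: if its latest atom is still waiting for a
 * CPU, record the wakeup-to-sched-in delta and fold it into the
 * thread's total and maximum latency.
 */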
995static void
996add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
997{
998    struct work_atom *atom;
999    u64 delta;
1000
1001    if (list_empty(&atoms->work_list))
1002        return;
1003
1004    atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1005
1006    if (atom->state != THREAD_WAIT_CPU)
1007        return;
1008
1009    if (timestamp < atom->wake_up_time) {
1010        atom->state = THREAD_IGNORE;
1011        return;
1012    }
1013
1014    atom->state = THREAD_SCHED_IN;
1015    atom->sched_in_time = timestamp;
1016
1017    delta = atom->sched_in_time - atom->wake_up_time;
1018    atoms->total_lat += delta;
1019    if (delta > atoms->max_lat) {
1020        atoms->max_lat = delta;
1021        atoms->max_lat_at = timestamp;
1022    }
1023    atoms->nb_atoms++;
1024}
1025
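/*
 * sched_switch handler for 'perf sched latency': append a sched-out atom
 * for the task leaving the CPU (a preempted, still-runnable task goes to
 * THREAD_WAIT_CPU) and complete the pending atom of the incoming task,
 * which yields its scheduling delay.
 */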
1026static void
1027latency_switch_event(struct trace_switch_event *switch_event,
1028             struct machine *machine,
1029             struct event *event __used,
1030             int cpu,
1031             u64 timestamp,
1032             struct thread *thread __used)
1033{
1034    struct work_atoms *out_events, *in_events;
1035    struct thread *sched_out, *sched_in;
1036    u64 timestamp0;
1037    s64 delta;
1038
1039    BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1040
1041    timestamp0 = cpu_last_switched[cpu];
1042    cpu_last_switched[cpu] = timestamp;
1043    if (timestamp0)
1044        delta = timestamp - timestamp0;
1045    else
1046        delta = 0;
1047
1048    if (delta < 0)
1049        die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1050
1051
1052    sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1053    sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1054
1055    out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1056    if (!out_events) {
1057        thread_atoms_insert(sched_out);
1058        out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1059        if (!out_events)
1060            die("out-event: Internal tree error");
1061    }
1062    add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
1063
1064    in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1065    if (!in_events) {
1066        thread_atoms_insert(sched_in);
1067        in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1068        if (!in_events)
1069            die("in-event: Internal tree error");
1070        /*
1071         * Task came in that we have not heard about yet;
1072         * add an initial atom in runnable state:
1073         */
1074        add_sched_out_event(in_events, 'R', timestamp);
1075    }
1076    add_sched_in_event(in_events, timestamp);
1077}
1078
1079static void
1080latency_runtime_event(struct trace_runtime_event *runtime_event,
1081             struct machine *machine,
1082             struct event *event __used,
1083             int cpu,
1084             u64 timestamp,
1085             struct thread *this_thread __used)
1086{
1087    struct thread *thread = machine__findnew_thread(machine, runtime_event->pid);
1088    struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1089
1090    BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1091    if (!atoms) {
1092        thread_atoms_insert(thread);
1093        atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1094        if (!atoms)
1095            die("in-event: Internal tree error");
1096        add_sched_out_event(atoms, 'R', timestamp);
1097    }
1098
1099    add_runtime_event(atoms, runtime_event->runtime, timestamp);
1100}
1101
1102static void
1103latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1104             struct machine *machine,
1105             struct event *__event __used,
1106             int cpu __used,
1107             u64 timestamp,
1108             struct thread *thread __used)
1109{
1110    struct work_atoms *atoms;
1111    struct work_atom *atom;
1112    struct thread *wakee;
1113
1114    /* Note for later, it may be interesting to observe the failing cases */
1115    if (!wakeup_event->success)
1116        return;
1117
1118    wakee = machine__findnew_thread(machine, wakeup_event->pid);
1119    atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1120    if (!atoms) {
1121        thread_atoms_insert(wakee);
1122        atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1123        if (!atoms)
1124            die("wakeup-event: Internal tree error");
1125        add_sched_out_event(atoms, 'S', timestamp);
1126    }
1127
1128    BUG_ON(list_empty(&atoms->work_list));
1129
1130    atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1131
1132    /*
1133     * You WILL be missing events if you've recorded only
1134     * one CPU, or are only looking at one, so don't
1135     * make useless noise.
1136     */
1137    if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1138        nr_state_machine_bugs++;
1139
1140    nr_timestamps++;
1141    if (atom->sched_out_time > timestamp) {
1142        nr_unordered_timestamps++;
1143        return;
1144    }
1145
1146    atom->state = THREAD_WAIT_CPU;
1147    atom->wake_up_time = timestamp;
1148}
1149
1150static void
1151latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
1152             struct machine *machine,
1153             struct event *__event __used,
1154             int cpu __used,
1155             u64 timestamp,
1156             struct thread *thread __used)
1157{
1158    struct work_atoms *atoms;
1159    struct work_atom *atom;
1160    struct thread *migrant;
1161
1162    /*
1163     * Only need to worry about migration when profiling one CPU.
1164     */
1165    if (profile_cpu == -1)
1166        return;
1167
1168    migrant = machine__findnew_thread(machine, migrate_task_event->pid);
1169    atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1170    if (!atoms) {
1171        thread_atoms_insert(migrant);
1172        register_pid(migrant->pid, migrant->comm);
1173        atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
1174        if (!atoms)
1175            die("migration-event: Internal tree error");
1176        add_sched_out_event(atoms, 'R', timestamp);
1177    }
1178
1179    BUG_ON(list_empty(&atoms->work_list));
1180
1181    atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1182    atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1183
1184    nr_timestamps++;
1185
1186    if (atom->sched_out_time > timestamp)
1187        nr_unordered_timestamps++;
1188}
1189
1190static struct trace_sched_handler lat_ops = {
1191    .wakeup_event = latency_wakeup_event,
1192    .switch_event = latency_switch_event,
1193    .runtime_event = latency_runtime_event,
1194    .fork_event = latency_fork_event,
1195    .migrate_task_event = latency_migrate_task_event,
1196};
1197
1198static void output_lat_thread(struct work_atoms *work_list)
1199{
1200    int i;
1201    int ret;
1202    u64 avg;
1203
1204    if (!work_list->nb_atoms)
1205        return;
1206    /*
1207     * Ignore idle threads:
1208     */
1209    if (!strcmp(work_list->thread->comm, "swapper"))
1210        return;
1211
1212    all_runtime += work_list->total_runtime;
1213    all_count += work_list->nb_atoms;
1214
1215    ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid);
1216
1217    for (i = 0; i < 24 - ret; i++)
1218        printf(" ");
1219
1220    avg = work_list->total_lat / work_list->nb_atoms;
1221
1222    printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
1223          (double)work_list->total_runtime / 1e6,
1224         work_list->nb_atoms, (double)avg / 1e6,
1225         (double)work_list->max_lat / 1e6,
1226         (double)work_list->max_lat_at / 1e9);
1227}
1228
1229static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1230{
1231    if (l->thread->pid < r->thread->pid)
1232        return -1;
1233    if (l->thread->pid > r->thread->pid)
1234        return 1;
1235
1236    return 0;
1237}
1238
1239static struct sort_dimension pid_sort_dimension = {
1240    .name = "pid",
1241    .cmp = pid_cmp,
1242};
1243
1244static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1245{
1246    u64 avgl, avgr;
1247
1248    if (!l->nb_atoms)
1249        return -1;
1250
1251    if (!r->nb_atoms)
1252        return 1;
1253
1254    avgl = l->total_lat / l->nb_atoms;
1255    avgr = r->total_lat / r->nb_atoms;
1256
1257    if (avgl < avgr)
1258        return -1;
1259    if (avgl > avgr)
1260        return 1;
1261
1262    return 0;
1263}
1264
1265static struct sort_dimension avg_sort_dimension = {
1266    .name = "avg",
1267    .cmp = avg_cmp,
1268};
1269
1270static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1271{
1272    if (l->max_lat < r->max_lat)
1273        return -1;
1274    if (l->max_lat > r->max_lat)
1275        return 1;
1276
1277    return 0;
1278}
1279
1280static struct sort_dimension max_sort_dimension = {
1281    .name = "max",
1282    .cmp = max_cmp,
1283};
1284
1285static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1286{
1287    if (l->nb_atoms < r->nb_atoms)
1288        return -1;
1289    if (l->nb_atoms > r->nb_atoms)
1290        return 1;
1291
1292    return 0;
1293}
1294
1295static struct sort_dimension switch_sort_dimension = {
1296    .name = "switch",
1297    .cmp = switch_cmp,
1298};
1299
1300static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1301{
1302    if (l->total_runtime < r->total_runtime)
1303        return -1;
1304    if (l->total_runtime > r->total_runtime)
1305        return 1;
1306
1307    return 0;
1308}
1309
1310static struct sort_dimension runtime_sort_dimension = {
1311    .name = "runtime",
1312    .cmp = runtime_cmp,
1313};
1314
1315static struct sort_dimension *available_sorts[] = {
1316    &pid_sort_dimension,
1317    &avg_sort_dimension,
1318    &max_sort_dimension,
1319    &switch_sort_dimension,
1320    &runtime_sort_dimension,
1321};
1322
1323#define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
1324
1325static LIST_HEAD(sort_list);
1326
1327static int sort_dimension__add(const char *tok, struct list_head *list)
1328{
1329    int i;
1330
1331    for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
1332        if (!strcmp(available_sorts[i]->name, tok)) {
1333            list_add_tail(&available_sorts[i]->list, list);
1334
1335            return 0;
1336        }
1337    }
1338
1339    return -1;
1340}
1341
1342static void setup_sorting(void);
1343
1344static void sort_lat(void)
1345{
1346    struct rb_node *node;
1347
1348    for (;;) {
1349        struct work_atoms *data;
1350        node = rb_first(&atom_root);
1351        if (!node)
1352            break;
1353
1354        rb_erase(node, &atom_root);
1355        data = rb_entry(node, struct work_atoms, node);
1356        __thread_latency_insert(&sorted_atom_root, data, &sort_list);
1357    }
1358}
1359
1360static struct trace_sched_handler *trace_handler;
1361
1362static void
1363process_sched_wakeup_event(struct perf_tool *tool __used,
1364               struct event *event,
1365               struct perf_sample *sample,
1366               struct machine *machine,
1367               struct thread *thread)
1368{
1369    void *data = sample->raw_data;
1370    struct trace_wakeup_event wakeup_event;
1371
1372    FILL_COMMON_FIELDS(wakeup_event, event, data);
1373
1374    FILL_ARRAY(wakeup_event, comm, event, data);
1375    FILL_FIELD(wakeup_event, pid, event, data);
1376    FILL_FIELD(wakeup_event, prio, event, data);
1377    FILL_FIELD(wakeup_event, success, event, data);
1378    FILL_FIELD(wakeup_event, cpu, event, data);
1379
1380    if (trace_handler->wakeup_event)
1381        trace_handler->wakeup_event(&wakeup_event, machine, event,
1382                        sample->cpu, sample->time, thread);
1383}
1384
1385/*
1386 * Track the current task - that way we can know whether there are any
1387 * weird events, such as a task being switched away that is not current.
1388 */
1389static int max_cpu;
1390
1391static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
1392
1393static struct thread *curr_thread[MAX_CPUS];
1394
1395static char next_shortname1 = 'A';
1396static char next_shortname2 = '0';
1397
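/*
 * 'perf sched map': every task gets a two-character shortname (A0..Z9);
 * on each context switch one line is printed with a column per CPU, '*'
 * marking the CPU where this switch happened and '.' marking the idle
 * task.
 */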
1398static void
1399map_switch_event(struct trace_switch_event *switch_event,
1400         struct machine *machine,
1401         struct event *event __used,
1402         int this_cpu,
1403         u64 timestamp,
1404         struct thread *thread __used)
1405{
1406    struct thread *sched_out __used, *sched_in;
1407    int new_shortname;
1408    u64 timestamp0;
1409    s64 delta;
1410    int cpu;
1411
1412    BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1413
1414    if (this_cpu > max_cpu)
1415        max_cpu = this_cpu;
1416
1417    timestamp0 = cpu_last_switched[this_cpu];
1418    cpu_last_switched[this_cpu] = timestamp;
1419    if (timestamp0)
1420        delta = timestamp - timestamp0;
1421    else
1422        delta = 0;
1423
1424    if (delta < 0)
1425        die("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1426
1427
1428    sched_out = machine__findnew_thread(machine, switch_event->prev_pid);
1429    sched_in = machine__findnew_thread(machine, switch_event->next_pid);
1430
1431    curr_thread[this_cpu] = sched_in;
1432
1433    printf(" ");
1434
1435    new_shortname = 0;
1436    if (!sched_in->shortname[0]) {
1437        sched_in->shortname[0] = next_shortname1;
1438        sched_in->shortname[1] = next_shortname2;
1439
1440        if (next_shortname1 < 'Z') {
1441            next_shortname1++;
1442        } else {
1443            next_shortname1='A';
1444            if (next_shortname2 < '9') {
1445                next_shortname2++;
1446            } else {
1447                next_shortname2='0';
1448            }
1449        }
1450        new_shortname = 1;
1451    }
1452
1453    for (cpu = 0; cpu <= max_cpu; cpu++) {
1454        if (cpu != this_cpu)
1455            printf(" ");
1456        else
1457            printf("*");
1458
1459        if (curr_thread[cpu]) {
1460            if (curr_thread[cpu]->pid)
1461                printf("%2s ", curr_thread[cpu]->shortname);
1462            else
1463                printf(". ");
1464        } else
1465            printf(" ");
1466    }
1467
1468    printf(" %12.6f secs ", (double)timestamp/1e9);
1469    if (new_shortname) {
1470        printf("%s => %s:%d\n",
1471            sched_in->shortname, sched_in->comm, sched_in->pid);
1472    } else {
1473        printf("\n");
1474    }
1475}
1476
1477static void
1478process_sched_switch_event(struct perf_tool *tool __used,
1479               struct event *event,
1480               struct perf_sample *sample,
1481               struct machine *machine,
1482               struct thread *thread)
1483{
1484    int this_cpu = sample->cpu;
1485    void *data = sample->raw_data;
1486    struct trace_switch_event switch_event;
1487
1488    FILL_COMMON_FIELDS(switch_event, event, data);
1489
1490    FILL_ARRAY(switch_event, prev_comm, event, data);
1491    FILL_FIELD(switch_event, prev_pid, event, data);
1492    FILL_FIELD(switch_event, prev_prio, event, data);
1493    FILL_FIELD(switch_event, prev_state, event, data);
1494    FILL_ARRAY(switch_event, next_comm, event, data);
1495    FILL_FIELD(switch_event, next_pid, event, data);
1496    FILL_FIELD(switch_event, next_prio, event, data);
1497
1498    if (curr_pid[this_cpu] != (u32)-1) {
1499        /*
1500         * Are we trying to switch away a PID that is
1501         * not current?
1502         */
1503        if (curr_pid[this_cpu] != switch_event.prev_pid)
1504            nr_context_switch_bugs++;
1505    }
1506    if (trace_handler->switch_event)
1507        trace_handler->switch_event(&switch_event, machine, event,
1508                        this_cpu, sample->time, thread);
1509
1510    curr_pid[this_cpu] = switch_event.next_pid;
1511}
1512
1513static void
1514process_sched_runtime_event(struct perf_tool *tool __used,
1515                struct event *event,
1516                struct perf_sample *sample,
1517                struct machine *machine,
1518                struct thread *thread)
1519{
1520    void *data = sample->raw_data;
1521    struct trace_runtime_event runtime_event;
1522
1523    FILL_ARRAY(runtime_event, comm, event, data);
1524    FILL_FIELD(runtime_event, pid, event, data);
1525    FILL_FIELD(runtime_event, runtime, event, data);
1526    FILL_FIELD(runtime_event, vruntime, event, data);
1527
1528    if (trace_handler->runtime_event)
1529        trace_handler->runtime_event(&runtime_event, machine, event,
1530                         sample->cpu, sample->time, thread);
1531}
1532
1533static void
1534process_sched_fork_event(struct perf_tool *tool __used,
1535             struct event *event,
1536             struct perf_sample *sample,
1537             struct machine *machine __used,
1538             struct thread *thread)
1539{
1540    void *data = sample->raw_data;
1541    struct trace_fork_event fork_event;
1542
1543    FILL_COMMON_FIELDS(fork_event, event, data);
1544
1545    FILL_ARRAY(fork_event, parent_comm, event, data);
1546    FILL_FIELD(fork_event, parent_pid, event, data);
1547    FILL_ARRAY(fork_event, child_comm, event, data);
1548    FILL_FIELD(fork_event, child_pid, event, data);
1549
1550    if (trace_handler->fork_event)
1551        trace_handler->fork_event(&fork_event, event,
1552                      sample->cpu, sample->time, thread);
1553}
1554
1555static void
1556process_sched_exit_event(struct perf_tool *tool __used,
1557             struct event *event,
1558             struct perf_sample *sample __used,
1559             struct machine *machine __used,
1560             struct thread *thread __used)
1561{
1562    if (verbose)
1563        printf("sched_exit event %p\n", event);
1564}
1565
1566static void
1567process_sched_migrate_task_event(struct perf_tool *tool __used,
1568                 struct event *event,
1569                 struct perf_sample *sample,
1570                 struct machine *machine,
1571                 struct thread *thread)
1572{
1573    void *data = sample->raw_data;
1574    struct trace_migrate_task_event migrate_task_event;
1575
1576    FILL_COMMON_FIELDS(migrate_task_event, event, data);
1577
1578    FILL_ARRAY(migrate_task_event, comm, event, data);
1579    FILL_FIELD(migrate_task_event, pid, event, data);
1580    FILL_FIELD(migrate_task_event, prio, event, data);
1581    FILL_FIELD(migrate_task_event, cpu, event, data);
1582
1583    if (trace_handler->migrate_task_event)
1584        trace_handler->migrate_task_event(&migrate_task_event, machine,
1585                          event, sample->cpu,
1586                          sample->time, thread);
1587}
1588
1589typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event,
1590                   struct perf_sample *sample,
1591                   struct machine *machine,
1592                   struct thread *thread);
1593
1594static int perf_sched__process_tracepoint_sample(struct perf_tool *tool,
1595                         union perf_event *event __used,
1596                         struct perf_sample *sample,
1597                         struct perf_evsel *evsel,
1598                         struct machine *machine)
1599{
1600    struct thread *thread = machine__findnew_thread(machine, sample->pid);
1601
1602    if (thread == NULL) {
1603        pr_debug("problem processing %s event, skipping it.\n",
1604             evsel->name);
1605        return -1;
1606    }
1607
1608    evsel->hists.stats.total_period += sample->period;
1609    hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
1610
1611    if (evsel->handler.func != NULL) {
1612        tracepoint_handler f = evsel->handler.func;
1613
1614        if (evsel->handler.data == NULL)
1615            evsel->handler.data = trace_find_event(evsel->attr.config);
1616
1617        f(tool, evsel->handler.data, sample, machine, thread);
1618    }
1619
1620    return 0;
1621}
1622
1623static struct perf_tool perf_sched = {
1624    .sample = perf_sched__process_tracepoint_sample,
1625    .comm = perf_event__process_comm,
1626    .lost = perf_event__process_lost,
1627    .fork = perf_event__process_task,
1628    .ordered_samples = true,
1629};
1630
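/*
 * Open the input file (-i), attach the sched tracepoint handlers below,
 * process all events and remember the lost-event statistics.  With
 * destroy=true the session is deleted before returning, so callers that
 * want the session back pass destroy=false and a non-NULL psession.
 */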
1631static void read_events(bool destroy, struct perf_session **psession)
1632{
1633    int err = -EINVAL;
1634    const struct perf_evsel_str_handler handlers[] = {
1635        { "sched:sched_switch", process_sched_switch_event, },
1636        { "sched:sched_stat_runtime", process_sched_runtime_event, },
1637        { "sched:sched_wakeup", process_sched_wakeup_event, },
1638        { "sched:sched_wakeup_new", process_sched_wakeup_event, },
1639        { "sched:sched_process_fork", process_sched_fork_event, },
1640        { "sched:sched_process_exit", process_sched_exit_event, },
1641        { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1642    };
1643    struct perf_session *session = perf_session__new(input_name, O_RDONLY,
1644                             0, false, &perf_sched);
1645    if (session == NULL)
1646        die("No Memory");
1647
1648    err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers);
1649    assert(err == 0);
1650
1651    if (perf_session__has_traces(session, "record -R")) {
1652        err = perf_session__process_events(session, &perf_sched);
1653        if (err)
1654            die("Failed to process events, error %d", err);
1655
1656        nr_events = session->hists.stats.nr_events[0];
1657        nr_lost_events = session->hists.stats.total_lost;
1658        nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
1659    }
1660
1661    if (destroy)
1662        perf_session__delete(session);
1663
1664    if (psession)
1665        *psession = session;
1666}
1667
1668static void print_bad_events(void)
1669{
1670    if (nr_unordered_timestamps && nr_timestamps) {
1671        printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1672            (double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
1673            nr_unordered_timestamps, nr_timestamps);
1674    }
1675    if (nr_lost_events && nr_events) {
1676        printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1677            (double)nr_lost_events/(double)nr_events*100.0,
1678            nr_lost_events, nr_events, nr_lost_chunks);
1679    }
1680    if (nr_state_machine_bugs && nr_timestamps) {
1681        printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
1682            (double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
1683            nr_state_machine_bugs, nr_timestamps);
1684        if (nr_lost_events)
1685            printf(" (due to lost events?)");
1686        printf("\n");
1687    }
1688    if (nr_context_switch_bugs && nr_timestamps) {
1689        printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
1690            (double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
1691            nr_context_switch_bugs, nr_timestamps);
1692        if (nr_lost_events)
1693            printf(" (due to lost events?)");
1694        printf("\n");
1695    }
1696}
1697
1698static void __cmd_lat(void)
1699{
1700    struct rb_node *next;
1701    struct perf_session *session;
1702
1703    setup_pager();
1704    read_events(false, &session);
1705    sort_lat();
1706
1707    printf("\n ---------------------------------------------------------------------------------------------------------------\n");
1708    printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
1709    printf(" ---------------------------------------------------------------------------------------------------------------\n");
1710
1711    next = rb_first(&sorted_atom_root);
1712
1713    while (next) {
1714        struct work_atoms *work_list;
1715
1716        work_list = rb_entry(next, struct work_atoms, node);
1717        output_lat_thread(work_list);
1718        next = rb_next(next);
1719    }
1720
1721    printf(" -----------------------------------------------------------------------------------------\n");
1722    printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
1723        (double)all_runtime/1e6, all_count);
1724
1725    printf(" ---------------------------------------------------\n");
1726
1727    print_bad_events();
1728    printf("\n");
1729
1730    perf_session__delete(session);
1731}
1732
1733static struct trace_sched_handler map_ops = {
1734    .wakeup_event = NULL,
1735    .switch_event = map_switch_event,
1736    .runtime_event = NULL,
1737    .fork_event = NULL,
1738};
1739
1740static void __cmd_map(void)
1741{
1742    max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1743
1744    setup_pager();
1745    read_events(true, NULL);
1746    print_bad_events();
1747}
1748
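/*
 * 'perf sched replay': calibrate the spin/sleep primitives, rebuild each
 * recorded task and its run/sleep/wakeup atoms from the trace, spawn one
 * thread per task and re-run the whole workload replay_repeat times.
 */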
1749static void __cmd_replay(void)
1750{
1751    unsigned long i;
1752
1753    calibrate_run_measurement_overhead();
1754    calibrate_sleep_measurement_overhead();
1755
1756    test_calibrations();
1757
1758    read_events(true, NULL);
1759
1760    printf("nr_run_events: %ld\n", nr_run_events);
1761    printf("nr_sleep_events: %ld\n", nr_sleep_events);
1762    printf("nr_wakeup_events: %ld\n", nr_wakeup_events);
1763
1764    if (targetless_wakeups)
1765        printf("target-less wakeups: %ld\n", targetless_wakeups);
1766    if (multitarget_wakeups)
1767        printf("multi-target wakeups: %ld\n", multitarget_wakeups);
1768    if (nr_run_events_optimized)
1769        printf("run atoms optimized: %ld\n",
1770            nr_run_events_optimized);
1771
1772    print_task_traces();
1773    add_cross_task_wakeups();
1774
1775    create_tasks();
1776    printf("------------------------------------------------------------\n");
1777    for (i = 0; i < replay_repeat; i++)
1778        run_one_test();
1779}
1780
1781
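/*
 * Illustrative invocations (not an exhaustive list):
 *
 *   perf sched record -- sleep 1     # record system-wide sched:* tracepoints
 *   perf sched latency --sort max    # per-task wakeup latency table
 *   perf sched map                   # per-CPU context switch map
 *   perf sched replay -r 10          # re-create and re-run the workload
 */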
1782static const char * const sched_usage[] = {
1783    "perf sched [<options>] {record|latency|map|replay|script}",
1784    NULL
1785};
1786
1787static const struct option sched_options[] = {
1788    OPT_STRING('i', "input", &input_name, "file",
1789            "input file name"),
1790    OPT_INCR('v', "verbose", &verbose,
1791            "be more verbose (show symbol address, etc)"),
1792    OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1793            "dump raw trace in ASCII"),
1794    OPT_END()
1795};
1796
1797static const char * const latency_usage[] = {
1798    "perf sched latency [<options>]",
1799    NULL
1800};
1801
1802static const struct option latency_options[] = {
1803    OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1804           "sort by key(s): runtime, switch, avg, max"),
1805    OPT_INCR('v', "verbose", &verbose,
1806            "be more verbose (show symbol address, etc)"),
1807    OPT_INTEGER('C', "CPU", &profile_cpu,
1808            "CPU to profile on"),
1809    OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1810            "dump raw trace in ASCII"),
1811    OPT_END()
1812};
1813
1814static const char * const replay_usage[] = {
1815    "perf sched replay [<options>]",
1816    NULL
1817};
1818
1819static const struct option replay_options[] = {
1820    OPT_UINTEGER('r', "repeat", &replay_repeat,
1821             "repeat the workload replay N times (-1: infinite)"),
1822    OPT_INCR('v', "verbose", &verbose,
1823            "be more verbose (show symbol address, etc)"),
1824    OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1825            "dump raw trace in ASCII"),
1826    OPT_END()
1827};
1828
1829static void setup_sorting(void)
1830{
1831    char *tmp, *tok, *str = strdup(sort_order);
1832
1833    for (tok = strtok_r(str, ", ", &tmp);
1834            tok; tok = strtok_r(NULL, ", ", &tmp)) {
1835        if (sort_dimension__add(tok, &sort_list) < 0) {
1836            error("Unknown --sort key: `%s'", tok);
1837            usage_with_options(latency_usage, latency_options);
1838        }
1839    }
1840
1841    free(str);
1842
1843    sort_dimension__add("pid", &cmp_pid);
1844}
1845
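/*
 * Argument template for 'perf sched record': system-wide collection
 * (-a), raw sample records (-R), a sample for every event (-c 1), plus
 * the sched tracepoints the latency/map/replay passes consume.  Any
 * extra user arguments are appended in __cmd_record().
 */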
1846static const char *record_args[] = {
1847    "record",
1848    "-a",
1849    "-R",
1850    "-f",
1851    "-m", "1024",
1852    "-c", "1",
1853    "-e", "sched:sched_switch",
1854    "-e", "sched:sched_stat_wait",
1855    "-e", "sched:sched_stat_sleep",
1856    "-e", "sched:sched_stat_iowait",
1857    "-e", "sched:sched_stat_runtime",
1858    "-e", "sched:sched_process_exit",
1859    "-e", "sched:sched_process_fork",
1860    "-e", "sched:sched_wakeup",
1861    "-e", "sched:sched_migrate_task",
1862};
1863
1864static int __cmd_record(int argc, const char **argv)
1865{
1866    unsigned int rec_argc, i, j;
1867    const char **rec_argv;
1868
1869    rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1870    rec_argv = calloc(rec_argc + 1, sizeof(char *));
1871
1872    if (rec_argv == NULL)
1873        return -ENOMEM;
1874
1875    for (i = 0; i < ARRAY_SIZE(record_args); i++)
1876        rec_argv[i] = strdup(record_args[i]);
1877
1878    for (j = 1; j < (unsigned int)argc; j++, i++)
1879        rec_argv[i] = argv[j];
1880
1881    BUG_ON(i != rec_argc);
1882
1883    return cmd_record(i, rec_argv, NULL);
1884}
1885
1886int cmd_sched(int argc, const char **argv, const char *prefix __used)
1887{
1888    argc = parse_options(argc, argv, sched_options, sched_usage,
1889                 PARSE_OPT_STOP_AT_NON_OPTION);
1890    if (!argc)
1891        usage_with_options(sched_usage, sched_options);
1892
1893    /*
1894     * Aliased to 'perf script' for now:
1895     */
1896    if (!strcmp(argv[0], "script"))
1897        return cmd_script(argc, argv, prefix);
1898
1899    symbol__init();
1900    if (!strncmp(argv[0], "rec", 3)) {
1901        return __cmd_record(argc, argv);
1902    } else if (!strncmp(argv[0], "lat", 3)) {
1903        trace_handler = &lat_ops;
1904        if (argc > 1) {
1905            argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1906            if (argc)
1907                usage_with_options(latency_usage, latency_options);
1908        }
1909        setup_sorting();
1910        __cmd_lat();
1911    } else if (!strcmp(argv[0], "map")) {
1912        trace_handler = &map_ops;
1913        setup_sorting();
1914        __cmd_map();
1915    } else if (!strncmp(argv[0], "rep", 3)) {
1916        trace_handler = &replay_ops;
1917        if (argc) {
1918            argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1919            if (argc)
1920                usage_with_options(replay_usage, replay_options);
1921        }
1922        __cmd_replay();
1923    } else {
1924        usage_with_options(sched_usage, sched_options);
1925    }
1926
1927    return 0;
1928}
1929
